org.kitesdk.data.DatasetDescriptor#getPartitionStrategy

Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0

6 votes

/**
 * Loads (or creates) the dataset for this task attempt and, when that dataset
 * is an {@code AbstractDataset}, narrows it to the view selected by the
 * options carried on the configured output URI.
 *
 * @param taskContext the task attempt context whose configuration holds the
 *                    {@code KITE_OUTPUT_URI} setting
 * @return the filtered view, or the dataset itself when it is not an
 *         {@code AbstractDataset}
 */
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);

  // The configured output URI wraps the dataset URI in its scheme-specific part.
  URI outputUri = URI.create(conf.get(KITE_OUTPUT_URI));
  URI datasetUri = URI.create(outputUri.getSchemeSpecificPart());
  Map<String, String> uriOptions =
      Registration.lookupDatasetUri(datasetUri).second();

  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  // Only AbstractDataset exposes filter(Constraints); anything else is
  // returned unrefined.
  if (!(dataset instanceof AbstractDataset)) {
    return dataset;
  }

  DatasetDescriptor descriptor = dataset.getDescriptor();
  Schema schema = descriptor.getSchema();
  // The strategy stays null for unpartitioned datasets.
  PartitionStrategy strategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;

  Constraints constraints =
      Constraints.fromQueryMap(schema, strategy, uriOptions);
  return ((AbstractDataset<E>) dataset).filter(constraints);
}

Source File: PartitionedDatasetWriter.java From kite with Apache License 2.0

5 votes

/**
 * Creates a writer for the partitioned dataset behind {@code view}.
 *
 * @param view the view to write into; its dataset must be partitioned
 * @throws IllegalArgumentException presumably, via
 *         {@code Preconditions.checkArgument}, when the dataset is not
 *         partitioned
 */
private PartitionedDatasetWriter(FileSystemView<E> view) {
  final DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Dataset " + view.getDataset() + " is not partitioned");

  this.view = view;
  this.partitionStrategy = descriptor.getPartitionStrategy();

  // Use the strategy's cardinality as the default writer cache size only when
  // it lies in [0, DEFAULT_WRITER_CACHE_SIZE]; otherwise use the default size.
  final int cardinality = partitionStrategy.getCardinality();
  final boolean cardinalityInRange =
      cardinality >= 0 && cardinality <= DEFAULT_WRITER_CACHE_SIZE;
  final int defaultMaxWriters =
      cardinalityInRange ? cardinality : DEFAULT_WRITER_CACHE_SIZE;
  this.maxWriters = DescriptorUtil.getInt(
      WRITER_CACHE_SIZE_PROP, descriptor, defaultMaxWriters);

  this.state = ReaderWriterState.NEW;
  this.reusedKey = new StorageKey(partitionStrategy);
  this.accessor = view.getAccessor();
  this.provided = view.getProvidedValues();

  // File rolling properties: the target file size is read from the descriptor
  // for every format except Parquet, which always gets -1.
  final boolean parquet = Formats.PARQUET.equals(descriptor.getFormat());
  this.targetFileSize = parquet
      ? -1
      : DescriptorUtil.getLong(TARGET_FILE_SIZE_PROP, descriptor, -1);

  // The property is expressed in seconds; the field holds milliseconds.
  this.rollIntervalMillis = 1000 * DescriptorUtil.getLong(
      ROLL_INTERVAL_S_PROP, descriptor, -1);
}

Source File: FileSystemView.java From kite with Apache License 2.0

5 votes

/**
 * Builds an iterator over the partitions under this view's root, restricted
 * by this view's key predicate.
 *
 * @return a new partition iterator
 * @throws DatasetException if constructing the iterator raises an
 *         {@code IOException}
 */
private FileSystemPartitionIterator partitionIterator() {
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  final FileSystemPartitionIterator partitions;
  try {
    partitions = new FileSystemPartitionIterator(
        fs,
        root,
        descriptor.getPartitionStrategy(),
        descriptor.getSchema(),
        getKeyPredicate());
  } catch (IOException ex) {
    // Surface the checked I/O failure as the unchecked dataset exception.
    throw new DatasetException("Cannot list partitions in view:" + this, ex);
  }
  return partitions;
}

Source File: FileSystemDataset.java From kite with Apache License 2.0

5 votes

/**
 * Creates a file system backed dataset.
 *
 * @param fileSystem        the file system holding the data
 * @param directory         the dataset directory
 * @param namespace         the dataset namespace
 * @param name              the dataset name
 * @param descriptor        the dataset descriptor; supplies schema, format and
 *                          (when partitioned) the partition strategy
 * @param uri               the dataset URI
 * @param partitionListener optional listener handed to the unbounded view
 * @param type              the entity class; for Parquet datasets it must be
 *                          {@code Object} or implement {@code IndexedRecord}
 */
FileSystemDataset(FileSystem fileSystem, Path directory,
                  String namespace, String name,
                  DatasetDescriptor descriptor, URI uri,
                  @Nullable PartitionListener partitionListener,
                  Class<E> type) {
  super(type, descriptor.getSchema());

  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    final boolean supportedModel =
        IndexedRecord.class.isAssignableFrom(type) || type == Object.class;
    Preconditions.checkArgument(supportedModel,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  // Unpartitioned datasets carry no strategy.
  if (descriptor.isPartitioned()) {
    this.partitionStrategy = descriptor.getPartitionStrategy();
  } else {
    this.partitionStrategy = null;
  }
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  // The signal manager works out of a dedicated directory resolved against
  // the dataset directory.
  final Path datasetRoot = getDirectory(fileSystem, directory);
  final Path signalsPath = new Path(datasetRoot, SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);

  // Built last among these because the view is handed this dataset instance.
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}

Source File: AbstractRefinableView.java From kite with Apache License 2.0

5 votes

/**
 * Creates an unrefined view over {@code dataset} for entities of {@code type}.
 *
 * <p>Partitioned datasets get constraints, a marker comparator and a
 * per-thread {@code StorageKey} built from the partition strategy;
 * unpartitioned datasets get schema-only constraints and no comparator or
 * keys.
 *
 * @param dataset the dataset this view covers
 * @param type    the entity class used to read and write
 */
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();

  if (!descriptor.isPartitioned()) {
    this.constraints = new Constraints(descriptor.getSchema());
    this.comparator = null;
    this.keys = null;
  } else {
    this.constraints = new Constraints(
        descriptor.getSchema(), descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    // Each thread lazily gets its own StorageKey for this strategy.
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  }

  this.accessor = DataModelUtil.accessor(type, descriptor.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  // Compatibility is checked in both directions between the dataset schema
  // and the accessor's read and write schemas.
  final Schema storedSchema = descriptor.getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      storedSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), storedSchema);

  // A view that can neither read nor write is rejected.
  final boolean usable = canRead || canWrite;
  IncompatibleSchemaException.check(usable,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), descriptor.getSchema());
}

Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0

5 votes

/**
 * Parses a raw schema literal into a key schema, taking both the schema and
 * the partition strategy from a descriptor built from that literal.
 *
 * @param rawSchema the schema literal to parse
 * @return the key schema for {@code rawSchema}
 */
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  final DatasetDescriptor parsed =
      new DatasetDescriptor.Builder().schemaLiteral(rawSchema).build();
  final AvroKeySchema keySchema = new AvroKeySchema(
      parsed.getSchema(), parsed.getPartitionStrategy());
  return keySchema;
}

Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0

5 votes

/**
 * Parses a raw schema literal into a key schema using an explicitly supplied
 * partition strategy.
 *
 * @param rawSchema         the schema literal to parse
 * @param partitionStrategy the partition strategy to pair with the schema
 * @return the key schema for the schema and strategy
 */
@Override
public AvroKeySchema parseKeySchema(String rawSchema,
    PartitionStrategy partitionStrategy) {
  // use DatasetDescriptor.Builder because it checks consistency
  final DatasetDescriptor checked = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .partitionStrategy(partitionStrategy)
      .build();
  final PartitionStrategy checkedStrategy = checked.getPartitionStrategy();
  return new AvroKeySchema(checked.getSchema(), checkedStrategy);
}

Source File: HBaseMetadataProviderTest.java From kite with Apache License 2.0

5 votes

/**
 * Creates a dataset from the test entity schema and checks the number of
 * field mappings and field partitioners on the returned descriptor.
 */
@Test
public void testBasic() {
  DatasetDescriptor requested =
      new DatasetDescriptor.Builder().schemaLiteral(testEntity).build();
  DatasetDescriptor created =
      provider.create("default", tableName + ".TestEntity", requested);

  ColumnMapping mapping = created.getColumnMapping();
  PartitionStrategy strategy = created.getPartitionStrategy();

  assertEquals(9, mapping.getFieldMappings().size());
  assertEquals(2, Accessor.getDefault().getFieldPartitioners(strategy).size());
}

Source File: HiveUtils.java From kite with Apache License 2.0

4 votes

/**
 * Builds a Hive {@code Table} that mirrors a dataset descriptor.
 *
 * @param namespace     namespace passed to {@code createEmptyTable}
 * @param name          table name passed to {@code createEmptyTable}
 * @param descriptor    the descriptor to translate
 * @param external      whether to create an external table located at the
 *                      descriptor's location, or a managed table
 * @param includeSchema whether to store the Avro schema (URL or literal) in
 *                      the table parameters
 * @return the populated table
 * @throws UnknownFormatException if the descriptor's format has no entry in
 *         {@code FORMAT_TO_SERDE}
 */
static Table tableForDescriptor(String namespace, String name,
                                DatasetDescriptor descriptor,
                                boolean external,
                                boolean includeSchema) {
  final Table table = createEmptyTable(namespace, name);

  if (!external) {
    table.setTableType(TableType.MANAGED_TABLE.toString());
  } else {
    // you'd think this would do it...
    table.setTableType(TableType.EXTERNAL_TABLE.toString());
    // but it doesn't work without some additional magic:
    table.getParameters().put("EXTERNAL", "TRUE");
    table.getSd().setLocation(descriptor.getLocation().toString());
  }

  addPropertiesForDescriptor(table, descriptor);

  // translate from Format to SerDe
  final Format format = descriptor.getFormat();
  if (!FORMAT_TO_SERDE.containsKey(format)) {
    throw new UnknownFormatException(
        "No known serde for format:" + format.getName());
  }
  table.getSd().getSerdeInfo().setSerializationLib(FORMAT_TO_SERDE.get(format));
  table.getSd().setInputFormat(FORMAT_TO_INPUT_FORMAT.get(format));
  table.getSd().setOutputFormat(FORMAT_TO_OUTPUT_FORMAT.get(format));

  if (includeSchema) {
    // Store the schema as a URL when useSchemaURL accepts it, otherwise as
    // the schema literal.
    final URL schemaURL = descriptor.getSchemaUrl();
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
          AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME, descriptor.getSchema().toString());
    }
  }

  table.getParameters().put(
      COMPRESSION_TYPE_PROPERTY_NAME,
      descriptor.getCompressionType().getName());

  // convert the schema to Hive columns
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    final PartitionStrategy strategy = descriptor.getPartitionStrategy();
    table.getParameters().put(
        PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(strategy));
    table.setPartitionKeys(partitionColumns(strategy, descriptor.getSchema()));
  }

  return table;
}

Source File: HiveUtils.java From kite with Apache License 2.0

4 votes

/**
 * Updates an existing Hive table so that its schema parameters, partition
 * expression, custom properties and columns match the descriptor.
 *
 * <p>The schema is written back in the form the table already uses (literal
 * or URL), except that an existing literal is replaced by the URL when
 * {@code useSchemaURL} accepts the descriptor's schema URL.
 *
 * @param table      the table to modify in place
 * @param descriptor the descriptor holding the current schema and properties
 * @throws DatasetOperationException if the table stores a schema URL but the
 *         descriptor has none
 * @throws DatasetException if the table stores no schema and the descriptor
 *         provides neither a usable URL nor a schema
 */
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  final URL schemaURL = descriptor.getSchemaUrl();
  final boolean tableHasLiteral =
      table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null;

  if (tableHasLiteral) {
    if (!useSchemaURL(schemaURL)) {
      // keep using the literal, refreshed from the descriptor
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    } else {
      // switch from the literal to the URL
      table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
      table.getParameters().put(
          AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());
    }

  } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    if (schemaURL == null) {
      throw new DatasetOperationException(
          "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " since descriptor schema URL is not set.");
    }
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());

  } else if (useSchemaURL(schemaURL)) {
    // neither the literal nor the URL is set, so add the URL if specified
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());

  } else if (descriptor.getSchema() != null) {
    // ... and the schema literal if not.
    table.getParameters().put(
        AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
        descriptor.getSchema().toString());

  } else {
    throw new DatasetException("Table schema cannot be updated since it is" +
            " not set on the descriptor.");
  }

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    final PartitionStrategy strategy = descriptor.getPartitionStrategy();
    table.getParameters().put(
        PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(strategy));
    // no need to set the partition columns; no changes to the Hive side
  }

  // keep the custom properties up-to-date
  addPropertiesForDescriptor(table, descriptor);

  // keep the table DDL up-to-date with the Schema
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));
}

Java Code Examples for org.kitesdk.data.DatasetDescriptor#getPartitionStrategy()