Java Code Examples for org.kitesdk.data.Dataset#getDescriptor()

The following examples show how to use org.kitesdk.data.Dataset#getDescriptor(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.

Example 1

Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0

6 votes

private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}

Example 2

Source File: TestFileSystemUtil.java From kite with Apache License 2.0

6 votes

@Test
public void testUnpartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  DatasetDescriptor expected = dataset.getDescriptor();
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      expected, actual);
}

Example 3

Source File: InfoCommand.java From kite with Apache License 2.0

5 votes

private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor desc = dataset.getDescriptor();
  String schema = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(desc.getSchema()))
      .toString(true);
  Collection<String> properties = desc.listProperties();

  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schema));
  if (desc.isPartitioned()) {
    console.info("\tPartition strategy: {}",
        indent(desc.getPartitionStrategy().toString(true)));
  } else {
    console.info("\tNot partitioned");
  }
  if (desc.isColumnMapped()) {
    console.info("\tColumn mapping: {}",
        indent(desc.getColumnMapping().toString(true)));
  }
  if (!properties.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    for (String prop : properties) {
      sb.append("\n\t\t").append(prop).append("=")
          .append(desc.getProperty(prop));
    }
    console.info("\tProperties:{}", sb.toString());
  }
}

Example 4

Source File: AbstractRefinableView.java From kite with Apache License 2.0

5 votes

protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (descriptor.isPartitioned()) {
    this.constraints = new Constraints(
        descriptor.getSchema(), descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  } else {
    this.constraints = new Constraints(descriptor.getSchema());
    this.comparator = null;
    this.keys = null;
  }
  this.accessor = DataModelUtil.accessor(type, descriptor.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  Schema datasetSchema = descriptor.getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);

  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), descriptor.getSchema());
}

Example 5

Source File: UpdateDatasetCommand.java From kite with Apache License 2.0

4 votes

@Override
public int run() throws IOException {
  if (datasets == null || datasets.size() != 1) {
    throw new IllegalArgumentException(
        "Exactly one dataset name must be specified.");
  }

  String dataset = datasets.remove(0);
  Dataset<GenericRecord> currentDataset = load(dataset).getDataset();

  DatasetDescriptor.Builder descriptorBuilder = new DatasetDescriptor
      .Builder(currentDataset.getDescriptor());

  if (avroSchemaFile != null) {
    descriptorBuilder.schemaUri(qualifiedURI(avroSchemaFile));
  }

  if (partitionStrategyFile != null) {
    descriptorBuilder.partitionStrategyUri(
        qualifiedURI(partitionStrategyFile));
  }

  if (properties != null) {
    for (String propValue : properties) {
      Iterator<String> parts = PROP_VALUE_SEP.split(propValue).iterator();
      descriptorBuilder.property(
          Iterators.getNext(parts, null),
          Iterators.getNext(parts, null));
    }
  }

  DatasetDescriptor descriptor = descriptorBuilder.build();

  if (isDatasetOrViewUri(dataset)) {
    Datasets.<GenericData.Record, Dataset<GenericData.Record>> update(dataset, descriptor, GenericData.Record.class);
  } else {
    getDatasetRepository().update(namespace, dataset, descriptor);
  }

  console.debug("Updated {}", dataset);

  return 0;
}