Java Code Examples for org.kitesdk.data.DatasetDescriptor#getSchema()
The following examples show how to use org.kitesdk.data.DatasetDescriptor#getSchema().
You can go to the original project or source file by following the links above each example.
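As a quick orientation before the examples: every one of them builds or obtains a DatasetDescriptor and reads its Avro Schema back with getSchema(). The sketch below shows that basic pattern in isolation. It is a minimal, hypothetical snippet (the class name and inline schema are illustrative, not taken from any of the projects below):

import org.apache.avro.Schema;
import org.kitesdk.data.DatasetDescriptor;

public class GetSchemaExample {
  public static void main(String[] args) {
    // Build a descriptor from an inline Avro schema literal;
    // the Builder parses and checks the schema during build().
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
            + "{\"name\": \"username\", \"type\": \"string\"}]}")
        .build();

    // getSchema() returns the descriptor's Avro Schema instance
    Schema schema = descriptor.getSchema();
    System.out.println(schema.toString(true)); // pretty-printed JSON
  }
}

The examples then differ mainly in where the schema comes from (a literal, a resource URI, or an existing dataset's descriptor) and in what the returned Schema is used for (constraints, compatibility checks, writers, and Hive metadata).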
Example 1
Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);
  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
Example 2
Source File: Compatibility.java From kite with Apache License 2.0
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} is compatible
 * with {@code test}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param test a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkCompatible(DatasetDescriptor existing, DatasetDescriptor test) {
  checkNotChanged("format", existing.getFormat(), test.getFormat());
  checkNotChanged("partitioning", existing.isPartitioned(), test.isPartitioned());
  if (existing.isPartitioned()) {
    checkStrategyUpdate(
        existing.getPartitionStrategy(),
        test.getPartitionStrategy(),
        test.getSchema());
  }
  // check can read records written with old schema using new schema
  Schema oldSchema = existing.getSchema();
  Schema testSchema = test.getSchema();
  if (!SchemaValidationUtil.canRead(oldSchema, testSchema)) {
    throw new IncompatibleSchemaException("Schema cannot read data " +
        "written using existing schema. Schema: " + testSchema.toString(true) +
        "\nExisting schema: " + oldSchema.toString(true));
  }
}
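The rule enforced here is standard Avro reader/writer compatibility: the new schema must be able to read records written with the old one. SchemaValidationUtil is internal to Kite, but the same check can be seen with plain Avro's SchemaCompatibility API; the schemas below are illustrative, not from the Kite sources:

import org.apache.avro.Schema;
import org.apache.avro.SchemaCompatibility;

public class CompatibilitySketch {
  public static void main(String[] args) {
    Schema oldSchema = new Schema.Parser().parse(
        "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"username\", \"type\": \"string\"}]}");
    // Adding a field with a default keeps the new schema able to
    // read data written with the old one.
    Schema newSchema = new Schema.Parser().parse(
        "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"username\", \"type\": \"string\"},"
        + "{\"name\": \"favoriteColor\", \"type\": \"string\", \"default\": \"unknown\"}]}");

    // reader = new schema, writer = old schema
    SchemaCompatibility.SchemaPairCompatibility result =
        SchemaCompatibility.checkReaderWriterCompatibility(newSchema, oldSchema);
    System.out.println(result.getType()); // COMPATIBLE
  }
}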
Example 3
Source File: TestGetSchema.java From nifi with Apache License 2.0
@Test
public void testSchemaFromResourceURI() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc") // in kite-data-core test-jar
      .build();
  Schema expected = descriptor.getSchema();

  Schema schema = AbstractKiteProcessor.getSchema(
      "resource:schema/user.avsc", DefaultConfiguration.get());

  Assert.assertEquals("Schema from resource URI should match", expected, schema);
}
Example 4
Source File: HBaseMetadataProvider.java From kite with Apache License 2.0
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) {
  // the SchemaManager stores schemas, so this embeds the column mapping and
  // partition strategy in the schema. the result is parsed by
  // AvroKeyEntitySchemaParser
  Schema schema = descriptor.getSchema();
  if (descriptor.isColumnMapped()) {
    schema = ColumnMappingParser
        .embedColumnMapping(schema, descriptor.getColumnMapping());
  }
  if (descriptor.isPartitioned()) {
    schema = PartitionStrategyParser
        .embedPartitionStrategy(schema, descriptor.getPartitionStrategy());
  }
  return schema;
}
Example 5
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema, ColumnMapping columnMapping) {
  // use DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .columnMapping(columnMapping)
      .build();
  return new AvroEntitySchema(
      descriptor.getSchema(), rawSchema, descriptor.getColumnMapping());
}
Example 6
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroEntitySchema(
      descriptor.getSchema(), rawSchema, descriptor.getColumnMapping());
}
Example 7
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema, PartitionStrategy partitionStrategy) {
  // use DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .partitionStrategy(partitionStrategy)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
Example 8
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
Example 9
Source File: DaoDataset.java From kite with Apache License 2.0
public DaoDataset(String namespace, String name, Dao<E> dao,
    DatasetDescriptor descriptor, URI uri, Class<E> type) {
  super(type, descriptor.getSchema());
  Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
      type == Object.class,
      "HBase only supports the generic and specific data models. The entity" +
      " type must implement IndexedRecord");
  this.namespace = namespace;
  this.name = name;
  this.dao = dao;
  this.descriptor = descriptor;
  this.uri = uri;
  this.unbounded = new DaoView<E>(this, type);
}
Example 10
Source File: AbstractRefinableView.java From kite with Apache License 2.0
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (descriptor.isPartitioned()) {
    this.constraints = new Constraints(
        descriptor.getSchema(), descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  } else {
    this.constraints = new Constraints(descriptor.getSchema());
    this.comparator = null;
    this.keys = null;
  }
  this.accessor = DataModelUtil.accessor(type, descriptor.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  Schema datasetSchema = descriptor.getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);
  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), descriptor.getSchema());
}
Example 11
Source File: FileSystemDataset.java From kite with Apache License 2.0
FileSystemDataset(FileSystem fileSystem, Path directory,
    String namespace, String name,
    DatasetDescriptor descriptor, URI uri,
    @Nullable PartitionListener partitionListener,
    Class<E> type) {
  super(type, descriptor.getSchema());
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type" +
        " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  this.partitionStrategy = descriptor.isPartitioned() ?
      descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
Example 12
Source File: CSVAppender.java From kite with Apache License 2.0
public CSVAppender(FileSystem fs, Path path, DatasetDescriptor descriptor) {
  this.fs = fs;
  this.path = path;
  this.schema = descriptor.getSchema();
  // Guava Preconditions uses %s placeholders, not slf4j-style {}
  Preconditions.checkState(schema.getType() == Schema.Type.RECORD,
      "Unsupported schema (not a record): %s", schema);
  this.props = CSVProperties.fromDescriptor(descriptor);
}
Example 13
Source File: FileSystemView.java From kite with Apache License 2.0
private FileSystemPartitionIterator partitionIterator() {
  DatasetDescriptor descriptor = dataset.getDescriptor();
  try {
    return new FileSystemPartitionIterator(
        fs, root, descriptor.getPartitionStrategy(), descriptor.getSchema(),
        getKeyPredicate());
  } catch (IOException ex) {
    throw new DatasetException("Cannot list partitions in view:" + this, ex);
  }
}
Example 14
Source File: DatasetSink.java From kite with Apache License 2.0
private DatasetWriter<GenericRecord> newWriter(
    final UserGroupInformation login, final URI uri) {
  View<GenericRecord> view = KerberosUtil.runPrivileged(login,
      new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
        @Override
        public Dataset<GenericRecord> run() {
          return Datasets.load(uri);
        }
      });

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  String formatName = descriptor.getFormat().getName();
  Preconditions.checkArgument(allowedFormats().contains(formatName),
      "Unsupported format: " + formatName);

  Schema newSchema = descriptor.getSchema();
  if (targetSchema == null || !newSchema.equals(targetSchema)) {
    this.targetSchema = descriptor.getSchema();
    // target dataset schema has changed, invalidate all readers based on it
    readers.invalidateAll();
  }

  this.reuseDatum = !("parquet".equals(formatName));
  this.datasetName = view.getDataset().getName();

  return view.newWriter();
}
Example 15
Source File: CreateHiveUserDatasetGeneric.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hive?dataset=users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 16
Source File: CreateUserDatasetGenericParquet.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 17
Source File: CreateUserDatasetGeneric.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 18
Source File: CreateUserDatasetGenericPartitioned.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a partition strategy that partitions on favoriteColor
  // using an identity partitioner
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("favoriteColor", "favorite_color")
      .build();

  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(partitionStrategy)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 19
Source File: TestGetSchema.java From localization_nifi with Apache License 2.0
@Test
public void testSchemaFromResourceURI() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc") // in kite-data-core test-jar
      .build();
  Schema expected = descriptor.getSchema();

  Schema schema = AbstractKiteProcessor.getSchema(
      "resource:schema/user.avsc", DefaultConfiguration.get());

  Assert.assertEquals("Schema from resource URI should match", expected, schema);
}
Example 20
Source File: HiveUtils.java From kite with Apache License 2.0
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  URL schemaURL = descriptor.getSchemaUrl();
  if (table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    if (useSchemaURL(schemaURL)) {
      table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
      table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME,
          schemaURL.toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }
  } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    if (schemaURL == null) {
      throw new DatasetOperationException(
          "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " since descriptor schema URL is not set.");
    }
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());
  } else {
    // neither the literal nor the URL is set, so add the URL if specified
    // and the schema literal if not.
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
          AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());
    } else if (descriptor.getSchema() != null) {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    } else {
      throw new DatasetException("Table schema cannot be updated since it is" +
          " not set on the descriptor.");
    }
  }

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    // no need to set the partition columns; no changes to the Hive side
  }

  // keep the custom properties up-to-date
  addPropertiesForDescriptor(table, descriptor);

  // keep the table DDL up-to-date with the Schema
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));
}