Java Code Examples for org.kitesdk.data.DatasetDescriptor#getSchema()
The following examples show how to use org.kitesdk.data.DatasetDescriptor#getSchema().
You can go to the original project or source file by following the links above each example.
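As a quick orientation before the examples: every one of them builds or obtains a DatasetDescriptor and reads its Avro Schema back with getSchema(). The sketch below shows that basic pattern in isolation. It is a minimal, hypothetical snippet (the class name and inline schema are illustrative, not taken from any of the projects below):

import org.apache.avro.Schema;
import org.kitesdk.data.DatasetDescriptor;

public class GetSchemaExample {
  public static void main(String[] args) {
    // Build a descriptor from an inline Avro schema literal;
    // the Builder parses and checks the schema during build().
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
            + "{\"name\": \"username\", \"type\": \"string\"}]}")
        .build();

    // getSchema() returns the descriptor's Avro Schema instance
    Schema schema = descriptor.getSchema();
    System.out.println(schema.toString(true)); // pretty-printed JSON
  }
}

The examples then differ mainly in where the schema comes from (a literal, a resource URI, or an existing dataset's descriptor) and in what the returned Schema is used for (constraints, compatibility checks, writers, and Hive metadata).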
Example 1
Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);
  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
Example 2
Source File: Compatibility.java From kite with Apache License 2.0
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} is compatible
 * with {@code test}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param test a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkCompatible(DatasetDescriptor existing, DatasetDescriptor test) {
  checkNotChanged("format", existing.getFormat(), test.getFormat());
  checkNotChanged("partitioning", existing.isPartitioned(), test.isPartitioned());
  if (existing.isPartitioned()) {
    checkStrategyUpdate(
        existing.getPartitionStrategy(),
        test.getPartitionStrategy(),
        test.getSchema());
  }
  // check can read records written with old schema using new schema
  Schema oldSchema = existing.getSchema();
  Schema testSchema = test.getSchema();
  if (!SchemaValidationUtil.canRead(oldSchema, testSchema)) {
    throw new IncompatibleSchemaException("Schema cannot read data " +
        "written using existing schema. Schema: " + testSchema.toString(true) +
        "\nExisting schema: " + oldSchema.toString(true));
  }
}
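The rule enforced here is standard Avro reader/writer compatibility: the new schema must be able to read records written with the old one. SchemaValidationUtil is internal to Kite, but the same check can be seen with plain Avro's SchemaCompatibility API; the schemas below are illustrative, not from the Kite sources:

import org.apache.avro.Schema;
import org.apache.avro.SchemaCompatibility;

public class CompatibilitySketch {
  public static void main(String[] args) {
    Schema oldSchema = new Schema.Parser().parse(
        "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"username\", \"type\": \"string\"}]}");
    // Adding a field with a default keeps the new schema able to
    // read data written with the old one.
    Schema newSchema = new Schema.Parser().parse(
        "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"username\", \"type\": \"string\"},"
        + "{\"name\": \"favoriteColor\", \"type\": \"string\", \"default\": \"unknown\"}]}");

    // reader = new schema, writer = old schema
    SchemaCompatibility.SchemaPairCompatibility result =
        SchemaCompatibility.checkReaderWriterCompatibility(newSchema, oldSchema);
    System.out.println(result.getType()); // COMPATIBLE
  }
}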
Example 3
Source File: TestGetSchema.java From nifi with Apache License 2.0
@Test
public void testSchemaFromResourceURI() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc") // in kite-data-core test-jar
      .build();
  Schema expected = descriptor.getSchema();

  Schema schema = AbstractKiteProcessor.getSchema(
      "resource:schema/user.avsc", DefaultConfiguration.get());

  Assert.assertEquals("Schema from resource URI should match", expected, schema);
}
Example 4
Source File: HBaseMetadataProvider.java From kite with Apache License 2.0
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) {
  // the SchemaManager stores schemas, so this embeds the column mapping and
  // partition strategy in the schema. the result is parsed by
  // AvroKeyEntitySchemaParser
  Schema schema = descriptor.getSchema();
  if (descriptor.isColumnMapped()) {
    schema = ColumnMappingParser
        .embedColumnMapping(schema, descriptor.getColumnMapping());
  }
  if (descriptor.isPartitioned()) {
    schema = PartitionStrategyParser
        .embedPartitionStrategy(schema, descriptor.getPartitionStrategy());
  }
  return schema;
}
Example 5
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema, ColumnMapping columnMapping) {
  // use DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .columnMapping(columnMapping)
      .build();
  return new AvroEntitySchema(
      descriptor.getSchema(), rawSchema, descriptor.getColumnMapping());
}
Example 6
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroEntitySchema(
      descriptor.getSchema(), rawSchema, descriptor.getColumnMapping());
}
Example 7
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema, PartitionStrategy partitionStrategy) {
  // use DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .partitionStrategy(partitionStrategy)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
Example 8
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
Example 9
Source File: DaoDataset.java From kite with Apache License 2.0
public DaoDataset(String namespace, String name, Dao<E> dao,
    DatasetDescriptor descriptor, URI uri, Class<E> type) {
  super(type, descriptor.getSchema());
  Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
      type == Object.class,
      "HBase only supports the generic and specific data models. The entity" +
      " type must implement IndexedRecord");
  this.namespace = namespace;
  this.name = name;
  this.dao = dao;
  this.descriptor = descriptor;
  this.uri = uri;
  this.unbounded = new DaoView<E>(this, type);
}
Example 10
Source File: AbstractRefinableView.java From kite with Apache License 2.0
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (descriptor.isPartitioned()) {
    this.constraints = new Constraints(
        descriptor.getSchema(), descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  } else {
    this.constraints = new Constraints(descriptor.getSchema());
    this.comparator = null;
    this.keys = null;
  }
  this.accessor = DataModelUtil.accessor(type, descriptor.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  Schema datasetSchema = descriptor.getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);
  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), descriptor.getSchema());
}
Example 11
Source File: FileSystemDataset.java From kite with Apache License 2.0
FileSystemDataset(FileSystem fileSystem, Path directory,
    String namespace, String name,
    DatasetDescriptor descriptor, URI uri,
    @Nullable PartitionListener partitionListener,
    Class<E> type) {
  super(type, descriptor.getSchema());
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type" +
        " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  this.partitionStrategy = descriptor.isPartitioned() ?
      descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
Example 12
Source File: CSVAppender.java From kite with Apache License 2.0
public CSVAppender(FileSystem fs, Path path, DatasetDescriptor descriptor) {
  this.fs = fs;
  this.path = path;
  this.schema = descriptor.getSchema();
  // Guava Preconditions uses %s placeholders, not slf4j-style {}
  Preconditions.checkState(schema.getType() == Schema.Type.RECORD,
      "Unsupported schema (not a record): %s", schema);
  this.props = CSVProperties.fromDescriptor(descriptor);
}
Example 13
Source File: FileSystemView.java From kite with Apache License 2.0
private FileSystemPartitionIterator partitionIterator() {
  DatasetDescriptor descriptor = dataset.getDescriptor();
  try {
    return new FileSystemPartitionIterator(
        fs, root, descriptor.getPartitionStrategy(), descriptor.getSchema(),
        getKeyPredicate());
  } catch (IOException ex) {
    throw new DatasetException("Cannot list partitions in view:" + this, ex);
  }
}
Example 14
Source File: DatasetSink.java From kite with Apache License 2.0
private DatasetWriter<GenericRecord> newWriter(
    final UserGroupInformation login, final URI uri) {
  View<GenericRecord> view = KerberosUtil.runPrivileged(login,
      new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
        @Override
        public Dataset<GenericRecord> run() {
          return Datasets.load(uri);
        }
      });

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  String formatName = descriptor.getFormat().getName();
  Preconditions.checkArgument(allowedFormats().contains(formatName),
      "Unsupported format: " + formatName);

  Schema newSchema = descriptor.getSchema();
  if (targetSchema == null || !newSchema.equals(targetSchema)) {
    this.targetSchema = descriptor.getSchema();
    // target dataset schema has changed, invalidate all readers based on it
    readers.invalidateAll();
  }

  this.reuseDatum = !("parquet".equals(formatName));
  this.datasetName = view.getDataset().getName();

  return view.newWriter();
}
Example 15
Source File: CreateHiveUserDatasetGeneric.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hive?dataset=users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 16
Source File: CreateUserDatasetGenericParquet.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 17
Source File: CreateUserDatasetGeneric.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 18
Source File: CreateUserDatasetGenericPartitioned.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a partition strategy that partitions on favoriteColor
  // using an identity partitioner
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("favoriteColor", "favorite_color")
      .build();

  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(partitionStrategy)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 19
Source File: TestGetSchema.java From localization_nifi with Apache License 2.0
@Test
public void testSchemaFromResourceURI() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc") // in kite-data-core test-jar
      .build();
  Schema expected = descriptor.getSchema();

  Schema schema = AbstractKiteProcessor.getSchema(
      "resource:schema/user.avsc", DefaultConfiguration.get());

  Assert.assertEquals("Schema from resource URI should match", expected, schema);
}
Example 20
Source File: HiveUtils.java From kite with Apache License 2.0
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  URL schemaURL = descriptor.getSchemaUrl();
  if (table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    if (useSchemaURL(schemaURL)) {
      table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
      table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME,
          schemaURL.toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }
  } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    if (schemaURL == null) {
      throw new DatasetOperationException(
          "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " since descriptor schema URL is not set.");
    }
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());
  } else {
    // neither the literal nor the URL is set, so add the URL if specified
    // and the schema literal if not.
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
          AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm());
    } else if (descriptor.getSchema() != null) {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    } else {
      throw new DatasetException("Table schema cannot be updated since it is" +
          " not set on the descriptor.");
    }
  }

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    // no need to set the partition columns; no changes to the Hive side
  }

  // keep the custom properties up-to-date
  addPropertiesForDescriptor(table, descriptor);

  // keep the table DDL up-to-date with the Schema
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));
}