Java Code Examples for org.kitesdk.data.DatasetDescriptor#getPartitionStrategy()
The following examples show how to use
org.kitesdk.data.DatasetDescriptor#getPartitionStrategy().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0 | 6 votes |
/**
 * Loads (or creates) the task-attempt dataset and, when possible, narrows it
 * to the constraints encoded in the configured output URI.
 *
 * <p>The output URI is read from the job configuration under
 * {@code KITE_OUTPUT_URI}; its scheme-specific part is resolved through
 * {@code Registration.lookupDatasetUri} and the second element of the result
 * is used as the query-option map. If the dataset is not an
 * {@code AbstractDataset}, it is returned without any filtering.
 *
 * @param taskContext the task attempt whose configuration and dataset are used
 * @return the filtered view, or the dataset itself when it cannot be filtered
 */
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);

  // The configured URI wraps the dataset URI in its scheme-specific part.
  URI outputUri = URI.create(conf.get(KITE_OUTPUT_URI));
  URI datasetUri = URI.create(outputUri.getSchemeSpecificPart());
  Map<String, String> queryOptions =
      Registration.lookupDatasetUri(datasetUri).second();

  Dataset<E> attemptDataset = loadOrCreateTaskAttemptDataset(taskContext);
  if (!(attemptDataset instanceof AbstractDataset)) {
    return attemptDataset;
  }

  DatasetDescriptor descriptor = attemptDataset.getDescriptor();
  Schema schema = descriptor.getSchema();
  // The strategy is only requested for partitioned datasets; otherwise null is passed.
  PartitionStrategy strategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;

  Constraints constraints =
      Constraints.fromQueryMap(schema, strategy, queryOptions);
  return ((AbstractDataset<E>) attemptDataset).filter(constraints);
}
Example 2
Source File: PartitionedDatasetWriter.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a writer for the partitioned dataset behind {@code view}.
 *
 * <p>The writer-cache size defaults to the partition strategy's cardinality,
 * falling back to {@code DEFAULT_WRITER_CACHE_SIZE} when the cardinality is
 * negative or larger than that constant; the descriptor property
 * {@code WRITER_CACHE_SIZE_PROP} overrides the default.
 *
 * @param view the view to write through; its dataset must be partitioned,
 *             which is enforced with {@code Preconditions.checkArgument}
 */
private PartitionedDatasetWriter(FileSystemView<E> view) {
  final DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Dataset " + view.getDataset() + " is not partitioned");

  this.view = view;
  this.partitionStrategy = descriptor.getPartitionStrategy();

  // Use the cardinality as the default only when it lies in [0, DEFAULT_WRITER_CACHE_SIZE].
  final int cardinality = partitionStrategy.getCardinality();
  final boolean cardinalityUsable =
      cardinality >= 0 && cardinality <= DEFAULT_WRITER_CACHE_SIZE;
  final int defaultMaxWriters =
      cardinalityUsable ? cardinality : DEFAULT_WRITER_CACHE_SIZE;
  this.maxWriters = DescriptorUtil.getInt(
      WRITER_CACHE_SIZE_PROP, descriptor, defaultMaxWriters);

  this.state = ReaderWriterState.NEW;
  this.reusedKey = new StorageKey(partitionStrategy);
  this.accessor = view.getAccessor();
  this.provided = view.getProvidedValues();

  // File rolling properties: the target size is fixed at -1 for Parquet and
  // is otherwise read from the descriptor (default -1).
  final boolean parquet = Formats.PARQUET.equals(descriptor.getFormat());
  this.targetFileSize = parquet
      ? -1
      : DescriptorUtil.getLong(TARGET_FILE_SIZE_PROP, descriptor, -1);

  // The roll interval property is expressed in seconds; store milliseconds.
  this.rollIntervalMillis = 1000 * DescriptorUtil.getLong(
      ROLL_INTERVAL_S_PROP, descriptor, -1);
}
Example 3
Source File: FileSystemView.java From kite with Apache License 2.0 | 5 votes |
/**
 * Builds an iterator over the partitions of this view.
 *
 * <p>The iterator is created from this view's file system, root, the
 * dataset's partition strategy and schema, and this view's key predicate.
 *
 * @return a new partition iterator
 * @throws DatasetException if creating the iterator fails with an
 *         {@code IOException}; the original exception is kept as the cause
 */
private FileSystemPartitionIterator partitionIterator() {
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  final FileSystemPartitionIterator partitions;
  try {
    partitions = new FileSystemPartitionIterator(
        fs,
        root,
        descriptor.getPartitionStrategy(),
        descriptor.getSchema(),
        getKeyPredicate());
  } catch (IOException ex) {
    throw new DatasetException("Cannot list partitions in view:" + this, ex);
  }
  return partitions;
}
Example 4
Source File: FileSystemDataset.java From kite with Apache License 2.0 | 5 votes |
FileSystemDataset(FileSystem fileSystem, Path directory, String namespace, String name, DatasetDescriptor descriptor, URI uri, @Nullable PartitionListener partitionListener, Class<E> type) { super(type, descriptor.getSchema()); if (Formats.PARQUET.equals(descriptor.getFormat())) { Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) || type == Object.class, "Parquet only supports generic and specific data models, type" + " parameter must implement IndexedRecord"); } this.fileSystem = fileSystem; this.directory = directory; this.namespace = namespace; this.name = name; this.descriptor = descriptor; this.partitionStrategy = descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null; this.partitionListener = partitionListener; this.convert = new PathConversion(descriptor.getSchema()); this.uri = uri; Path signalsPath = new Path(getDirectory(fileSystem, directory), SIGNALS_DIRECTORY_NAME); this.signalManager = new SignalManager(fileSystem, signalsPath); this.unbounded = new FileSystemPartitionView<E>( this, partitionListener, signalManager, type); // remove this.partitionKey for 0.14.0 this.partitionKey = null; }
Example 5
Source File: AbstractRefinableView.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates an unconstrained view over {@code dataset} for entities of
 * {@code type}.
 *
 * <p>For partitioned datasets the constraints, marker comparator and
 * per-thread storage keys are built from the partition strategy; otherwise
 * the comparator and keys are {@code null}. The constructor then checks,
 * through {@code IncompatibleSchemaException.check}, that the type's schema
 * can be used to read from or write to the dataset.
 *
 * @param dataset the dataset this view covers
 * @param type    the entity class used to obtain the accessor
 */
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  final Schema datasetSchema = descriptor.getSchema();

  if (!descriptor.isPartitioned()) {
    this.constraints = new Constraints(datasetSchema);
    this.comparator = null;
    this.keys = null;
  } else {
    this.constraints = new Constraints(
        datasetSchema, descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    // Each thread lazily gets its own StorageKey for the partition strategy.
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  }

  this.accessor = DataModelUtil.accessor(type, datasetSchema);
  this.entityTest = constraints.toEntityPredicate(accessor);

  // Readable: the dataset schema can be read with the accessor's read schema.
  // Writable: the accessor's write schema can be read with the dataset schema.
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);

  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), datasetSchema);
}
Example 6
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0 | 5 votes |
/**
 * Parses a raw schema string into an {@code AvroKeySchema}.
 *
 * <p>The schema is routed through a {@code DatasetDescriptor.Builder}, and
 * the key schema is assembled from the built descriptor's schema and
 * partition strategy.
 *
 * @param rawSchema the schema literal to parse
 * @return the key schema derived from the built descriptor
 */
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  final DatasetDescriptor parsed =
      new DatasetDescriptor.Builder().schemaLiteral(rawSchema).build();
  return new AvroKeySchema(parsed.getSchema(), parsed.getPartitionStrategy());
}
Example 7
Source File: AvroKeyEntitySchemaParser.java From kite with Apache License 2.0 | 5 votes |
@Override public AvroKeySchema parseKeySchema(String rawSchema, PartitionStrategy partitionStrategy) { // use DatasetDescriptor.Builder because it checks consistency DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schemaLiteral(rawSchema) .partitionStrategy(partitionStrategy) .build(); return new AvroKeySchema( descriptor.getSchema(), descriptor.getPartitionStrategy()); }
Example 8
Source File: HBaseMetadataProviderTest.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a descriptor for the test entity through the provider and checks
 * the column mapping and partition strategy it reports: 9 field mappings and
 * 2 field partitioners are expected.
 */
@Test
public void testBasic() {
  String datasetName = tableName + ".TestEntity";
  DatasetDescriptor requested =
      new DatasetDescriptor.Builder().schemaLiteral(testEntity).build();

  DatasetDescriptor created = provider.create("default", datasetName, requested);

  ColumnMapping mapping = created.getColumnMapping();
  PartitionStrategy strategy = created.getPartitionStrategy();
  assertEquals(9, mapping.getFieldMappings().size());
  assertEquals(2, Accessor.getDefault().getFieldPartitioners(strategy).size());
}
Example 9
Source File: HiveUtils.java From kite with Apache License 2.0 | 4 votes |
static Table tableForDescriptor(String namespace, String name, DatasetDescriptor descriptor, boolean external, boolean includeSchema) { final Table table = createEmptyTable(namespace, name); if (external) { // you'd think this would do it... table.setTableType(TableType.EXTERNAL_TABLE.toString()); // but it doesn't work without some additional magic: table.getParameters().put("EXTERNAL", "TRUE"); table.getSd().setLocation(descriptor.getLocation().toString()); } else { table.setTableType(TableType.MANAGED_TABLE.toString()); } addPropertiesForDescriptor(table, descriptor); // translate from Format to SerDe final Format format = descriptor.getFormat(); if (FORMAT_TO_SERDE.containsKey(format)) { table.getSd().getSerdeInfo().setSerializationLib(FORMAT_TO_SERDE.get(format)); table.getSd().setInputFormat(FORMAT_TO_INPUT_FORMAT.get(format)); table.getSd().setOutputFormat(FORMAT_TO_OUTPUT_FORMAT.get(format)); } else { throw new UnknownFormatException( "No known serde for format:" + format.getName()); } if (includeSchema) { URL schemaURL = descriptor.getSchemaUrl(); if (useSchemaURL(schemaURL)) { table.getParameters().put( AVRO_SCHEMA_URL_PROPERTY_NAME, descriptor.getSchemaUrl().toExternalForm()); } else { table.getParameters().put( AVRO_SCHEMA_LITERAL_PROPERTY_NAME, descriptor.getSchema().toString()); } } table.getParameters().put(COMPRESSION_TYPE_PROPERTY_NAME, descriptor.getCompressionType().getName()); // convert the schema to Hive columns table.getSd().setCols(HiveSchemaConverter.convertSchema(descriptor.getSchema())); // copy partitioning info if (descriptor.isPartitioned()) { PartitionStrategy ps = descriptor.getPartitionStrategy(); table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME, Accessor.getDefault().toExpression(ps)); table.setPartitionKeys(partitionColumns(ps, descriptor.getSchema())); } return table; }
Example 10
Source File: HiveUtils.java From kite with Apache License 2.0 | 4 votes |
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) { URL schemaURL = descriptor.getSchemaUrl(); if (table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) { if (useSchemaURL(schemaURL)) { table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME); table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm()); } else { table.getParameters().put( AVRO_SCHEMA_LITERAL_PROPERTY_NAME, descriptor.getSchema().toString()); } } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) { if (schemaURL == null) { throw new DatasetOperationException( "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME + " since descriptor schema URL is not set."); } table.getParameters().put( AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm()); } else { // neither the literal or the URL are set, so add the URL if specified // and the schema literal if not. if (useSchemaURL(schemaURL)) { table.getParameters().put( AVRO_SCHEMA_URL_PROPERTY_NAME, schemaURL.toExternalForm()); } else if (descriptor.getSchema() != null) { table.getParameters().put( AVRO_SCHEMA_LITERAL_PROPERTY_NAME, descriptor.getSchema().toString()); } else { throw new DatasetException("Table schema cannot be updated since it is" + " not set on the descriptor."); } } // copy partitioning info if (descriptor.isPartitioned()) { PartitionStrategy ps = descriptor.getPartitionStrategy(); table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME, Accessor.getDefault().toExpression(ps)); // no need to set the partition columns; no changes to the Hive side } // keep the custom properties up-to-date addPropertiesForDescriptor(table, descriptor); // keep the table DDL up to-to-date with the Schema table.getSd().setCols( HiveSchemaConverter.convertSchema(descriptor.getSchema())); }