Java Code Examples for org.kitesdk.data.DatasetDescriptor#getFormat()
The following examples show how to use org.kitesdk.data.DatasetDescriptor#getFormat().
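Before the project examples, a minimal, self-contained sketch of the call itself may help. The class name and the inline schema literal below are hypothetical, while DatasetDescriptor.Builder, Formats, and Format#getName() are part of the Kite API:

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class GetFormatExample {
  public static void main(String[] args) {
    // build a descriptor for a hypothetical "User" record, stored as Parquet
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
            + "{\"name\": \"id\", \"type\": \"long\"}]}")
        .format(Formats.PARQUET)
        .build();

    // getFormat() returns the storage format declared for the dataset
    Format format = descriptor.getFormat();
    System.out.println(format.getName()); // prints "parquet"
  }
}

The project examples below all branch on the returned Format by comparing it against the constants in Formats (Formats.AVRO, Formats.PARQUET, Formats.CSV).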
Example 1
Source File: PartitionedDatasetWriter.java From kite with Apache License 2.0
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
Example 2
Source File: FileSystemWriter.java From kite with Apache License 2.0
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
    long rollIntervalMillis, long targetFileSize, DatasetDescriptor descriptor,
    Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  }
}
Example 3
Source File: FileSystemWriter.java From kite with Apache License 2.0
static boolean isSupportedFormat(DatasetDescriptor descriptor) {
  Format format = descriptor.getFormat();
  // CSV is only supported when it is explicitly enabled on the dataset
  return (SUPPORTED_FORMATS.contains(format) ||
      (Formats.CSV.equals(format) &&
       DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)));
}
Example 4
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testIncompatibleFormatFilesInSameFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create Avro and Parquet files in the same folder, with the same schema
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createParquetUserFile(fs, parent);

  Collection<DatasetDescriptor> descriptors = FileSystemUtil
      .findPotentialDatasets(fs, root);

  Assert.assertEquals("Should have 2 descriptors", 2, descriptors.size());

  DatasetDescriptor avro;
  DatasetDescriptor parquet;
  DatasetDescriptor first = Iterables.getFirst(descriptors, null);
  if (first.getFormat() == Formats.AVRO) {
    avro = first;
    parquet = Iterables.getLast(descriptors, null);
  } else {
    parquet = first;
    avro = Iterables.getLast(descriptors, null);
  }

  Assert.assertFalse("Should not flag at mixed depth",
      avro.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), parent(avro.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      avro.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema", USER_SCHEMA, avro.getSchema());
  Assert.assertEquals("Should have Avro format", Formats.AVRO, avro.getFormat());
  Assert.assertFalse("Should not be partitioned", avro.isPartitioned());

  Assert.assertFalse("Should not flag at mixed depth",
      parquet.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), parent(parquet.getLocation()));
  Assert.assertTrue("Should be a .parquet file",
      parquet.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use user schema", USER_SCHEMA, parquet.getSchema());
  Assert.assertEquals("Should have Parquet format", Formats.PARQUET, parquet.getFormat());
  Assert.assertFalse("Should not be partitioned", parquet.isPartitioned());
}
Example 5
Source File: HiveUtils.java From kite with Apache License 2.0
static Table tableForDescriptor(String namespace, String name,
    DatasetDescriptor descriptor, boolean external, boolean includeSchema) {
  final Table table = createEmptyTable(namespace, name);

  if (external) {
    // you'd think this would do it...
    table.setTableType(TableType.EXTERNAL_TABLE.toString());
    // but it doesn't work without some additional magic:
    table.getParameters().put("EXTERNAL", "TRUE");
    table.getSd().setLocation(descriptor.getLocation().toString());
  } else {
    table.setTableType(TableType.MANAGED_TABLE.toString());
  }

  addPropertiesForDescriptor(table, descriptor);

  // translate from Format to SerDe
  final Format format = descriptor.getFormat();
  if (FORMAT_TO_SERDE.containsKey(format)) {
    table.getSd().getSerdeInfo().setSerializationLib(FORMAT_TO_SERDE.get(format));
    table.getSd().setInputFormat(FORMAT_TO_INPUT_FORMAT.get(format));
    table.getSd().setOutputFormat(FORMAT_TO_OUTPUT_FORMAT.get(format));
  } else {
    throw new UnknownFormatException(
        "No known serde for format: " + format.getName());
  }

  if (includeSchema) {
    URL schemaURL = descriptor.getSchemaUrl();
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME,
          descriptor.getSchemaUrl().toExternalForm());
    } else {
      table.getParameters().put(AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }
  }

  table.getParameters().put(COMPRESSION_TYPE_PROPERTY_NAME,
      descriptor.getCompressionType().getName());

  // convert the schema to Hive columns
  table.getSd().setCols(HiveSchemaConverter.convertSchema(descriptor.getSchema()));

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    table.setPartitionKeys(partitionColumns(ps, descriptor.getSchema()));
  }

  return table;
}