org.kitesdk.data.Formats Java Examples
The following examples show how to use org.kitesdk.data.Formats.
The source file, project, and license are noted above each example.
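Before the examples, here is a minimal sketch (not taken from the Kite sources) of the typical role of Formats: selecting the on-disk format when building a DatasetDescriptor. The schema URI resource:user.avsc is a placeholder for an Avro schema on your classpath.

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class FormatsOverview {
  public static void main(String[] args) throws Exception {
    // Assumption: user.avsc is an Avro schema available on the classpath.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .format(Formats.PARQUET)
        .build();

    Format format = descriptor.getFormat();
    System.out.println(format.getName());                      // parquet
    System.out.println(format.getSupportedCompressionTypes()); // e.g. snappy, deflate, ...
  }
}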
Example #1
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files in the parent folder
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #2
Source File: AvroAppender.java From kite with Apache License 2.0
private CodecFactory getCodecFactory() {
  switch (compressionType) {
    case Snappy:
      return CodecFactory.snappyCodec();
    case Deflate:
      return CodecFactory.deflateCodec(9);
    case Bzip2:
      return CodecFactory.bzip2Codec();
    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(),
          Arrays.toString(
              Formats.AVRO.getSupportedCompressionTypes().toArray())));
  }
}
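The switch above rejects compression types that Avro does not support and lists the supported set in the error message. As a quick hedged sketch (not from the Kite sources), the same query can be made directly on each built-in format:

import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class ListSupportedCompression {
  public static void main(String[] args) {
    // Print the compression types each built-in format reports as supported.
    for (Format format : new Format[] {Formats.AVRO, Formats.PARQUET, Formats.CSV}) {
      System.out.println(format.getName() + ": "
          + format.getSupportedCompressionTypes());
    }
  }
}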
Example #3
Source File: FileSystemUtil.java From kite with Apache License 2.0
@Override
Result file(FileSystem fs, Path path) throws IOException {
  Format format = formatFromExt(path);
  Schema schema = null;
  if (format == Formats.AVRO) {
    schema = Schemas.fromAvro(fs, path);
  } else if (format == Formats.PARQUET) {
    schema = Schemas.fromParquet(fs, path);
  } else if (format == Formats.JSON) {
    schema = Schemas.fromJSON("record", fs, path);
  }

  if (schema == null) {
    return new Result.Unknown();
  }

  return new Result.Table(path, format, schema, path.depth());
}
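Example #3 relies on a helper, formatFromExt, that maps a file extension to a Format. Below is a hedged sketch of that idea; the helper is hypothetical, and it assumes Formats.fromString resolves built-in formats by name, throwing IllegalArgumentException for unknown names.

import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;

public class FormatFromExtension {
  // Hypothetical stand-in for formatFromExt: maps a file name's extension
  // to a built-in Format, or null when the extension is not recognized.
  static Format formatFromExtension(String fileName) {
    int dot = fileName.lastIndexOf('.');
    if (dot < 0 || dot == fileName.length() - 1) {
      return null;
    }
    try {
      return Formats.fromString(fileName.substring(dot + 1));
    } catch (IllegalArgumentException e) {
      return null; // not one of avro, parquet, json, csv, ...
    }
  }

  public static void main(String[] args) {
    Format f = formatFromExtension("part-00000.avro");
    System.out.println(f == null ? "unknown" : f.getName()); // avro
    f = formatFromExtension("README");
    System.out.println(f == null ? "unknown" : f.getName()); // unknown
  }
}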
Example #4
Source File: MultiFileDatasetReader.java From kite with Apache License 2.0
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106 private void openNextReader() { if (Formats.PARQUET.equals(descriptor.getFormat())) { this.reader = new ParquetFileSystemDatasetReader(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } else if (Formats.JSON.equals(descriptor.getFormat())) { this.reader = new JSONFileReader<E>( fileSystem, filesIter.next(), accessor); } else if (Formats.CSV.equals(descriptor.getFormat())) { this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(), descriptor, accessor); } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) { this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor); } else { this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(), accessor.getReadSchema(), accessor.getType()); } reader.initialize(); this.readerIterator = Iterators.filter(reader, constraints.toEntityPredicate( (pathIter != null ? pathIter.getStorageKey() : null), accessor)); }
Example #5
Source File: ParquetAppender.java From kite with Apache License 2.0
private CompressionCodecName getCompressionCodecName() {
  switch (compressionType) {
    case Snappy:
      return CompressionCodecName.SNAPPY;
    case Lzo:
      return CompressionCodecName.LZO;
    case Deflate:
      return CompressionCodecName.GZIP;
    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(),
          Arrays.toString(
              Formats.PARQUET.getSupportedCompressionTypes().toArray())));
  }
}
Example #6
Source File: PartitionedDatasetWriter.java From kite with Apache License 2.0
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
Example #7
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testSingleParquetFile() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create a single Parquet file
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri().getPath(), parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .parquet file",
      descriptor.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #8
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in the same folder
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #10
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #11
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #12
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithFormatChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build());

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .format(Formats.PARQUET)
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException e) {
    // expected
  }

  Assert.assertEquals(
      Formats.AVRO,
      repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
}
Example #13
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExistingPartitioned() throws Exception {
  command.datasets = Lists.newArrayList(existingPartitionedURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingPartitionedURI));

  PartitionStrategy providedVersionStrategy = new PartitionStrategy.Builder()
      .provided("version", "int")
      .build();

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingPartitionedURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertEquals("Should be partitioned with a provided partitioner",
      providedVersionStrategy, users.getDescriptor().getPartitionStrategy());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
Example #14
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExistingWithLocation() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.location = existingPartitionedPathWithPartition.toString();
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingDataURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      users.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
  Assert.assertTrue("Location should point to the partitioned data",
      String.valueOf(users.getDescriptor().getLocation())
          .endsWith(existingPartitionedPathWithPartition.toString()));
}
Example #15
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExisting() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingDataURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      users.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
Example #16
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testSingleAvroFile() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create a single Avro file
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri().getPath(), parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .avro file",
      descriptor.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
Example #17
Source File: FileSystemWriter.java From kite with Apache License 2.0
@VisibleForTesting
@SuppressWarnings("unchecked")
<E> FileAppender<E> newAppender(Path temp) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return (FileAppender<E>) new DurableParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    } else {
      return (FileAppender<E>) new ParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    }
  } else if (Formats.AVRO.equals(format)) {
    return new AvroAppender<E>(fs, temp, schema,
        descriptor.getCompressionType());
  } else if (Formats.CSV.equals(format) &&
      DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
    return new CSVAppender<E>(fs, temp, descriptor);
  } else {
    this.state = ReaderWriterState.ERROR;
    throw new UnknownFormatException("Unknown format " + descriptor);
  }
}
Example #18
Source File: FileSystemWriter.java From kite with Apache License 2.0
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
    long rollIntervalMillis, long targetFileSize,
    DatasetDescriptor descriptor, Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor,
          writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor,
        writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor,
        writerSchema);
  }
}
Example #19
Source File: TestFileSystemDataset.java From kite with Apache License 2.0
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.AVRO)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
Example #20
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testPartitionedDatasetWithEscapedChars() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .provided("s")
          .build())
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset.with("s", "test/-0"));
  writeUserToView(dataset.with("s", "test/-0"));

  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "s=test%2F-0");

  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      URI.create(partitionPath.toString()), actual.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
Example #21
Source File: FileSystemDataset.java From kite with Apache License 2.0
FileSystemDataset(FileSystem fileSystem, Path directory,
    String namespace, String name,
    DatasetDescriptor descriptor, URI uri,
    @Nullable PartitionListener partitionListener,
    Class<E> type) {
  super(type, descriptor.getSchema());
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
Example #22
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files under separate folders
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, new Path(parent, "2"));

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #23
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleAvroFilesAtDifferentDepths() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files at different depths
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #24
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleMergeTablesAtDifferentDepths() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create Avro files at mixed depths in the directory tree
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
Example #25
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testMultipleParquetFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files under separate folders
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, new Path(parent, "part"));
  createParquetEventFile(fs, new Path(parent, "2"));

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("partition_1", "string")
      .build();

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by partition_1=string",
      strategy, descriptor.getPartitionStrategy());
}
Example #26
Source File: TestMetadataProviders.java From kite with Apache License 2.0
@Before
public void setUp() throws IOException, URISyntaxException {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() :
      new Configuration());

  this.testDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(SchemaBuilder.record("Event").fields()
          .requiredLong("timestamp")
          .requiredString("message")
          .endRecord())
      .partitionStrategy(new PartitionStrategy.Builder()
          .year("timestamp")
          .month("timestamp")
          .day("timestamp")
          .build())
      .build();
  // something completely different
  this.anotherDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.PARQUET)
      .schema(SchemaBuilder.record("Record").fields()
          .requiredBytes("some_field")
          .requiredString("another_field")
          .endRecord())
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("some_field", 20000)
          .build())
      .build();

  this.provider = newProvider(conf);
}
Example #27
Source File: TestMetadataProviders.java From kite with Apache License 2.0
@Test
public void testLargeSchema() {
  // Only run this test in distributed mode, since non-HDFS schema URLs result
  // in the schema being loaded into the Hive metastore, and large schemas
  // can exceed the size limit of that.
  Assume.assumeTrue(distributed);

  Assert.assertFalse("Sanity check",
      provider.exists(NAMESPACE, "large_schema_test"));

  // Create a schema with many fields to ensure the underlying store can
  // handle it.
  SchemaBuilder.FieldAssembler<Schema> fields =
      SchemaBuilder.record("Event").fields();
  for (int i = 0; i < 1000; ++i) {
    fields.requiredString("field_" + i);
  }

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(fields.endRecord())
      .build();

  DatasetDescriptor created = provider.create(NAMESPACE,
      "large_schema_test", descriptor);

  Assert.assertEquals("Large schemas should match",
      descriptor.getSchema(), created.getSchema());
}
Example #28
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testCreateFailsIfNotCompatible() {
  // this will fail because the new descriptor uses a different format;
  // the old descriptor is found and used to validate the change
  TestHelpers.assertThrows("Create should fail because of a format change",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          Datasets.create("dataset:hive:/tmp/datasets/test",
              new DatasetDescriptor.Builder(descriptor)
                  .format(Formats.PARQUET)
                  .build());
        }
      });
}
Example #29
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testCreateIncompatibleSucceedsWithLocation() {
  // if there is a requested location then the default table isn't checked,
  // because only the default location would have been used
  Assert.assertNotNull("Create should succeed if location doesn't match",
      Datasets.create("dataset:hive:/tmp/datasets/test",
          new DatasetDescriptor.Builder(descriptor)
              .location(URI.create("file:/tmp/test-data/test"))
              .format(Formats.PARQUET)
              .build()));
}
Example #30
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testUpdateValidatesAgainstDefaultNamespace() {
  TestHelpers.assertThrows("Update should fail because of a format change",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          Datasets.update("dataset:hive:/tmp/datasets/test",
              new DatasetDescriptor.Builder(descriptor)
                  .format(Formats.PARQUET)
                  .build());
        }
      });
}