org.kitesdk.data.Dataset Java Examples
The following examples show how to use org.kitesdk.data.Dataset.
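Before the examples, here is a minimal sketch of the typical Dataset lifecycle (describe, create, write, read) that they all build on. It is an illustration only: the User record class, its constructor, and the HDFS path are hypothetical placeholders, while the Datasets, DatasetDescriptor, DatasetWriter, and DatasetReader calls follow the same API used in the examples below.

// A minimal sketch of the Dataset lifecycle. User is a hypothetical
// Avro-compatible record class; the URI is a placeholder path.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schema(User.class) // derive the Avro schema from the class
    .build();
Dataset<User> users = Datasets.create(
    "dataset:hdfs:/tmp/data/users", descriptor, User.class);

// Write a record, closing the writer even on failure
DatasetWriter<User> writer = null;
try {
  writer = users.newWriter();
  writer.write(new User("alice", "alice@example.com")); // hypothetical constructor
} finally {
  if (writer != null) {
    writer.close();
  }
}

// Read the records back
DatasetReader<User> reader = null;
try {
  reader = users.newReader();
  for (User user : reader) {
    System.out.println(user);
  }
} finally {
  if (reader != null) {
    reader.close();
  }
}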
Example #1
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #2
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();
  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #3
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testSpecificProjectionLoad() throws IOException {
  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = unbounded.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<SmallEvent> dataset = repo.load(
      "ns", unbounded.getDataset().getName(), SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(
      toSmallEvent(sepEvent), toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example #4
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testUpdateWithUpdatedURI() {
  Dataset<GenericRecord> updated = Datasets.update(
      "dataset:hive:/tmp/datasets/default/test",
      new DatasetDescriptor.Builder(descriptor)
          .property("added.property", "true")
          .build());
  Assert.assertNotNull("Update should succeed", updated);

  DatasetDescriptor stored = HiveUtils.descriptorForTable(
      conf, metastore.getTable("default", "test"));

  Assert.assertEquals("Should update default.test descriptor",
      stored, updated.getDescriptor());
  Assert.assertEquals("Added property should be present",
      stored.getProperty("added.property"), "true");
}
Example #5
Source File: TestHiveDatasetURIs.java From kite with Apache License 2.0
@Test
public void testExternalHDFSQueryOptions() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data?" + hdfsQueryArgs);
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets.<Object, Dataset<Object>>load(
      "dataset:hive:/tmp/data/ns/test?" + hdfsQueryArgsOld, Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
Example #6
Source File: ReadProductDatasetPojo.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the products dataset
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  // Get a reader for the dataset and read all the products
  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    for (Product product : reader) {
      System.out.println(product);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
Example #7
Source File: ReadDataset.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
Example #8
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #10
Source File: TestHiveExternalDatasetRepository.java From kite with Apache License 2.0
@SuppressWarnings("deprecation") @Test public void testNewPartitionIsVisibleToHive() throws Exception { final String NAME2 = "test2"; PartitionStrategy partitionStrategy = new PartitionStrategy.Builder() .hash("username", 2).build(); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(testSchema) .partitionStrategy(partitionStrategy) .build(); Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor); HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2); HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2); Assert.assertTrue("No partitions yet", client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty()); writeRecord(dataset, 0); Assert.assertEquals("Should be one partition", 1, client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size()); }
Example #11
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #12
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #13
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testUnpartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  DatasetDescriptor expected = dataset.getDescriptor();
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      expected, actual);
}
Example #14
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .provided("version").build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);
  Assert.assertEquals(1, datasetSize(inputView));

  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #15
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithLocationChange() {
  ensureCreated();

  Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
  URI location = dataset.getDescriptor().getLocation();

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .location(new Path(testDirectory, "newDataLocation").toUri())
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to data location change");
  } catch (ValidationException ex) {
    // expected
  }

  Assert.assertEquals(
      location, repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
}
Example #16
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
Example #17
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExistingPartitioned() throws Exception {
  command.datasets = Lists.newArrayList(existingPartitionedURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingPartitionedURI));

  PartitionStrategy providedVersionStrategy = new PartitionStrategy.Builder()
      .provided("version", "int")
      .build();

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingPartitionedURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertEquals("Should be partitioned with a provided partitioner",
      providedVersionStrategy, users.getDescriptor().getPartitionStrategy());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
Example #18
Source File: TestHiveDatasetURIsCompatibility.java From kite with Apache License 2.0
@Test
public void testLoadChangedRelativePathURIMissingNamespace() {
  // this used to be a relative external URI, but is now a managed URI
  String uri = "dataset:hive:ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);

  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);
  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
Example #19
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java From kite with Apache License 2.0
@Test
public void testExternal() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive:/tmp/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive:/tmp/data/ns/test");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
Example #20
Source File: TestHiveDatasetURIsCompatibility.java From kite with Apache License 2.0
@Test
public void testLoadChangedAbsolutePathURIMissingNamespace() {
  // this used to be an absolute external URI, but is now a managed URI
  String uri = "dataset:hive:/ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);

  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);
  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
Example #21
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testReadNullsWithPrimitivesAllowNullSchema() {
  final String name = "allowNullPrimitives";
  try {
    repo.create(NAMESPACE, name, new DatasetDescriptor.Builder()
        .schema(ReflectData.AllowNull.get().getSchema(ObjectPoJo.class))
        .build(), ObjectPoJo.class);

    // should load the dataset because PrimitivePoJo can be used to write
    final Dataset<PrimitivePoJo> dataset = repo.load(
        NAMESPACE, name, PrimitivePoJo.class);
    TestHelpers.assertThrows("AllowNull primitives cannot read nullable type",
        IncompatibleSchemaException.class, new Runnable() {
          @Override
          public void run() {
            dataset.newReader();
          }
        });

  } catch (RuntimeException e) {
    throw e;
  } finally {
    repo.delete(NAMESPACE, name);
  }
}
Example #22
Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0
@Override @SuppressWarnings("unchecked") public void commitTask(TaskAttemptContext taskContext) throws IOException { DatasetRepository repo = getDatasetRepository(taskContext); boolean inTempRepo = repo instanceof TemporaryDatasetRepository; Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, getJobDatasetName(taskContext)); String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext); if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) { Dataset<E> taskAttemptDataset = repo.load(TEMP_NAMESPACE, taskAttemptDatasetName); ((Mergeable<Dataset<E>>) jobDataset).merge(taskAttemptDataset); if (!inTempRepo) { repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName); } } }
Example #23
Source File: TestKiteURIHandler.java From kite with Apache License 2.0
@Test
public void checkURIExistsView() throws URIHandlerException, IOException {
  DatasetRepository repository = newRepo();
  Dataset<GenericRecord> dataset = repository.create("data", "readymailbox", testDescriptor);
  View<GenericRecord> view = dataset.with("message", "hello");

  ((Signalable<GenericRecord>) view).signalReady();

  Assert.assertTrue(uriHandler.exists(view.getUri(), null));
}
Example #24
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #25
Source File: TestHiveExternalDatasetRepository.java From kite with Apache License 2.0
private void writeRecord(Dataset<GenericRecord> dataset, int partition) {
  PartitionKey key = new PartitionKey(partition);
  DatasetWriter<GenericRecord> writer =
      ((PartitionedDataset<GenericRecord>) dataset).getPartition(key, true).newWriter();
  try {
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(
        dataset.getDescriptor().getSchema())
        .set("username", partition + "").set("email", partition + "@example.com");
    writer.write(recordBuilder.build());
  } finally {
    writer.close();
  }
}
Example #26
Source File: DatasetTestUtilities.java From kite with Apache License 2.0
@SuppressWarnings("deprecation") public static <E> void testPartitionKeysAreEqual(PartitionedDataset<E> ds, PartitionKey... expectedKeys) { Set<PartitionKey> expected = Sets.newHashSet(expectedKeys); Set<PartitionKey> actual = Sets.newHashSet(Iterables.transform(ds.getPartitions(), new Function<Dataset, PartitionKey>() { @Override public PartitionKey apply(Dataset input) { return ((FileSystemDataset) input).getPartitionKey(); } })); Assert.assertEquals(expected, actual); }
Example #27
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder()
      .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema, dataset
      .getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }

  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema, dataset
      .getDescriptor().getSchema());
}
Example #28
Source File: FileSystemDatasetRepository.java From kite with Apache License 2.0
@Override
public <E> Dataset<E> update(String namespace, String name,
    DatasetDescriptor descriptor, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  DatasetDescriptor oldDescriptor = metadataProvider.load(namespace, name);

  // oldDescriptor is valid if load didn't throw NoSuchDatasetException
  Compatibility.checkUpdate(oldDescriptor, descriptor);

  DatasetDescriptor updatedDescriptor = metadataProvider.update(namespace, name, descriptor);

  LOG.debug("Updated dataset: {} schema: {} location: {}", new Object[] {
      name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation() });

  return new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(updatedDescriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(updatedDescriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();
}
Example #29
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java From kite with Apache License 2.0
@Test
public void testManaged() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive?dataset=test&namespace=ns");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
Example #30
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testSpecificProjectionAsType() throws IOException {
  Dataset<GenericRecord> original = Datasets.load(unbounded.getUri());

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.asType(StandardEvent.class).newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  final View<SmallEvent> smallEvents = original.asType(SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(
      toSmallEvent(sepEvent), toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}