org.kitesdk.data.Dataset Java Examples
The following examples show how to use org.kitesdk.data.Dataset.
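Before the examples, here is a minimal sketch of the typical Dataset lifecycle (describe, create, write, read) that they all build on. It is an illustration only: the User record class, its constructor, and the HDFS path are hypothetical placeholders, while the Datasets, DatasetDescriptor, DatasetWriter, and DatasetReader calls follow the same API used in the examples below.

// A minimal sketch of the Dataset lifecycle. User is a hypothetical
// Avro-compatible record class; the URI is a placeholder path.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schema(User.class) // derive the Avro schema from the class
    .build();
Dataset<User> users = Datasets.create(
    "dataset:hdfs:/tmp/data/users", descriptor, User.class);

// Write a record, closing the writer even on failure
DatasetWriter<User> writer = null;
try {
  writer = users.newWriter();
  writer.write(new User("alice", "alice@example.com")); // hypothetical constructor
} finally {
  if (writer != null) {
    writer.close();
  }
}

// Read the records back
DatasetReader<User> reader = null;
try {
  reader = users.newReader();
  for (User user : reader) {
    System.out.println(user);
  }
} finally {
  if (reader != null) {
    reader.close();
  }
}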
Example #1
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #2
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();
  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #3
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testSpecificProjectionLoad() throws IOException {
  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = unbounded.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<SmallEvent> dataset = repo.load(
      "ns", unbounded.getDataset().getName(), SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(
      toSmallEvent(sepEvent), toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example #4
Source File: TestExternalBackwardCompatibility.java From kite with Apache License 2.0
@Test
public void testUpdateWithUpdatedURI() {
  Dataset<GenericRecord> updated = Datasets.update(
      "dataset:hive:/tmp/datasets/default/test",
      new DatasetDescriptor.Builder(descriptor)
          .property("added.property", "true")
          .build());
  Assert.assertNotNull("Update should succeed", updated);

  DatasetDescriptor stored = HiveUtils.descriptorForTable(
      conf, metastore.getTable("default", "test"));

  Assert.assertEquals("Should update default.test descriptor",
      stored, updated.getDescriptor());
  Assert.assertEquals("Added property should be present",
      stored.getProperty("added.property"), "true");
}
Example #5
Source File: TestHiveDatasetURIs.java From kite with Apache License 2.0
@Test
public void testExternalHDFSQueryOptions() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data?" + hdfsQueryArgs);
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets.<Object, Dataset<Object>>load(
      "dataset:hive:/tmp/data/ns/test?" + hdfsQueryArgsOld, Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
Example #6
Source File: ReadProductDatasetPojo.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the products dataset
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  // Get a reader for the dataset and read all the products
  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    for (Product product : reader) {
      System.out.println(product);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
Example #7
Source File: ReadDataset.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
Example #8
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #10
Source File: TestHiveExternalDatasetRepository.java From kite with Apache License 2.0
@SuppressWarnings("deprecation") @Test public void testNewPartitionIsVisibleToHive() throws Exception { final String NAME2 = "test2"; PartitionStrategy partitionStrategy = new PartitionStrategy.Builder() .hash("username", 2).build(); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(testSchema) .partitionStrategy(partitionStrategy) .build(); Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor); HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2); HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2); Assert.assertTrue("No partitions yet", client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty()); writeRecord(dataset, 0); Assert.assertEquals("Should be one partition", 1, client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size()); }
Example #11
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #12
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #13
Source File: TestFileSystemUtil.java From kite with Apache License 2.0
@Test
public void testUnpartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  DatasetDescriptor expected = dataset.getDescriptor();
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      expected, actual);
}
Example #14
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .provided("version").build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);
  Assert.assertEquals(1, datasetSize(inputView));

  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #15
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithLocationChange() {
  ensureCreated();

  Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
  URI location = dataset.getDescriptor().getLocation();

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .location(new Path(testDirectory, "newDataLocation").toUri())
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to data location change");
  } catch (ValidationException ex) {
    // expected
  }

  Assert.assertEquals(
      location, repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
}
Example #16
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
Example #17
Source File: TestCreateDatasetWithExistingData.java From kite with Apache License 2.0
@Test
public void testCreateFromExistingPartitioned() throws Exception {
  command.datasets = Lists.newArrayList(existingPartitionedURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingPartitionedURI));

  PartitionStrategy providedVersionStrategy = new PartitionStrategy.Builder()
      .provided("version", "int")
      .build();

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingPartitionedURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertEquals("Should be partitioned with a provided partitioner",
      providedVersionStrategy, users.getDescriptor().getPartitionStrategy());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
Example #18
Source File: TestHiveDatasetURIsCompatibility.java From kite with Apache License 2.0
@Test
public void testLoadChangedRelativePathURIMissingNamespace() {
  // this used to be a relative external URI, but is now a managed URI
  String uri = "dataset:hive:ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);

  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);
  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
Example #19
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java From kite with Apache License 2.0
@Test
public void testExternal() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive:/tmp/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive:/tmp/data/ns/test");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
Example #20
Source File: TestHiveDatasetURIsCompatibility.java From kite with Apache License 2.0
@Test
public void testLoadChangedAbsolutePathURIMissingNamespace() {
  // this used to be an absolute external URI, but is now a managed URI
  String uri = "dataset:hive:/ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);

  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);
  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
Example #21
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testReadNullsWithPrimitivesAllowNullSchema() {
  final String name = "allowNullPrimitives";
  try {
    repo.create(NAMESPACE, name, new DatasetDescriptor.Builder()
        .schema(ReflectData.AllowNull.get().getSchema(ObjectPoJo.class))
        .build(), ObjectPoJo.class);

    // should load the dataset because PrimitivePoJo can be used to write
    final Dataset<PrimitivePoJo> dataset = repo.load(
        NAMESPACE, name, PrimitivePoJo.class);
    TestHelpers.assertThrows("AllowNull primitives cannot read nullable type",
        IncompatibleSchemaException.class, new Runnable() {
          @Override
          public void run() {
            dataset.newReader();
          }
        });

  } catch (RuntimeException e) {
    throw e;
  } finally {
    repo.delete(NAMESPACE, name);
  }
}
Example #22
Source File: DatasetKeyOutputFormat.java From kite with Apache License 2.0
@Override @SuppressWarnings("unchecked") public void commitTask(TaskAttemptContext taskContext) throws IOException { DatasetRepository repo = getDatasetRepository(taskContext); boolean inTempRepo = repo instanceof TemporaryDatasetRepository; Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, getJobDatasetName(taskContext)); String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext); if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) { Dataset<E> taskAttemptDataset = repo.load(TEMP_NAMESPACE, taskAttemptDatasetName); ((Mergeable<Dataset<E>>) jobDataset).merge(taskAttemptDataset); if (!inTempRepo) { repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName); } } }
Example #23
Source File: TestKiteURIHandler.java From kite with Apache License 2.0
@Test
public void checkURIExistsView() throws URIHandlerException, IOException {
  DatasetRepository repository = newRepo();
  Dataset<GenericRecord> dataset = repository.create("data", "readymailbox", testDescriptor);
  View<GenericRecord> view = dataset.with("message", "hello");

  ((Signalable<GenericRecord>) view).signalReady();

  Assert.assertTrue(uriHandler.exists(view.getUri(), null));
}
Example #24
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #25
Source File: TestHiveExternalDatasetRepository.java From kite with Apache License 2.0
private void writeRecord(Dataset<GenericRecord> dataset, int partition) {
  PartitionKey key = new PartitionKey(partition);
  DatasetWriter<GenericRecord> writer =
      ((PartitionedDataset<GenericRecord>) dataset).getPartition(key, true).newWriter();
  try {
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(
        dataset.getDescriptor().getSchema())
        .set("username", partition + "").set("email", partition + "@example.com");
    writer.write(recordBuilder.build());
  } finally {
    writer.close();
  }
}
Example #26
Source File: DatasetTestUtilities.java From kite with Apache License 2.0
@SuppressWarnings("deprecation") public static <E> void testPartitionKeysAreEqual(PartitionedDataset<E> ds, PartitionKey... expectedKeys) { Set<PartitionKey> expected = Sets.newHashSet(expectedKeys); Set<PartitionKey> actual = Sets.newHashSet(Iterables.transform(ds.getPartitions(), new Function<Dataset, PartitionKey>() { @Override public PartitionKey apply(Dataset input) { return ((FileSystemDataset) input).getPartitionKey(); } })); Assert.assertEquals(expected, actual); }
Example #27
Source File: TestFileSystemDatasetRepository.java From kite with Apache License 2.0
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder()
      .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema, dataset
      .getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }

  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema, dataset
      .getDescriptor().getSchema());
}
Example #28
Source File: FileSystemDatasetRepository.java From kite with Apache License 2.0
@Override
public <E> Dataset<E> update(String namespace, String name,
    DatasetDescriptor descriptor, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  DatasetDescriptor oldDescriptor = metadataProvider.load(namespace, name);

  // oldDescriptor is valid if load didn't throw NoSuchDatasetException
  Compatibility.checkUpdate(oldDescriptor, descriptor);

  DatasetDescriptor updatedDescriptor = metadataProvider.update(namespace, name, descriptor);

  LOG.debug("Updated dataset: {} schema: {} location: {}", new Object[] {
      name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation() });

  return new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(updatedDescriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(updatedDescriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();
}
Example #29
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java From kite with Apache License 2.0
@Test
public void testManaged() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive?dataset=test&namespace=ns");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
Example #30
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testSpecificProjectionAsType() throws IOException {
  Dataset<GenericRecord> original = Datasets.load(unbounded.getUri());

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.asType(StandardEvent.class).newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  final View<SmallEvent> smallEvents = original.asType(SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(
      toSmallEvent(sepEvent), toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}