Java Code Examples for org.kitesdk.data.Datasets#create()

The following examples show how to use org.kitesdk.data.Datasets#create(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a dataset from {@code USER_SCHEMA} inside a nested temp folder, writes to
 * it twice, and asserts that {@code FileSystemUtil.findPotentialDatasets}, run from
 * the temp root, yields a single descriptor equal to the dataset's own descriptor.
 */
@Test
public void testUnpartitionedDataset() throws Exception {
  File datasetDir = temp.newFolder("a/b/c/d/e/dataset_name");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  URI location = URI.create("dataset:file:" + datasetDir.getAbsolutePath());
  DatasetDescriptor userDescriptor =
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build();
  Dataset<GenericRecord> created = Datasets.create(location, userDescriptor);

  // write two so that the descriptor uses the directory rather than a file
  for (int writes = 0; writes < 2; writes++) {
    writeUserToView(created);
  }

  DatasetDescriptor fromDataset = created.getDescriptor();
  DatasetDescriptor fromScan = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      fromDataset, fromScan);
}
 
Example 2
Source File: CreateEvents.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the events dataset at {@code uri} and writes generated events to it.
 *
 * <p>Fails with the state check if the dataset already exists. Events are written
 * for as long as the current time is less than 36000 ms past {@code baseTimestamp};
 * the writer is always closed afterwards.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 */
@Override
public int run(List<String> args) throws Exception {

  boolean alreadyThere = Datasets.exists(uri);
  Preconditions.checkState(!alreadyThere, "events dataset already exists");

  DatasetDescriptor eventDescriptor =
      new DatasetDescriptor.Builder().schema(StandardEvent.class).build();
  View<StandardEvent> eventView =
      Datasets.create(uri, eventDescriptor, StandardEvent.class);

  DatasetWriter<StandardEvent> eventWriter = eventView.newWriter();
  try {
    for (;;) {
      // stop once the elapsed time since baseTimestamp is no longer under 36000 ms
      if (!(System.currentTimeMillis() - baseTimestamp < 36000)) {
        break;
      }
      eventWriter.write(generateRandomEvent());
    }
  } finally {
    eventWriter.close();
  }

  System.out.println("Generated " + counter + " events");

  return 0;
}
 
Example 3
Source File: TestFileSystemDatasets.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Recreates a dataset without a partition strategy and checks two things:
 * resolving the dataset's own location returns the dataset itself, and resolving
 * a partition-style relative path is rejected with IllegalArgumentException.
 */
@Test
public void testDatasetNotPartitioned() {
  final String datasetUri = "dataset:file:/tmp/datasets/ns/test";

  // start from a clean slate, then create the dataset from the shared schema
  Datasets.delete(datasetUri);
  DatasetDescriptor plainDescriptor =
      new DatasetDescriptor.Builder().schema(schema).build();
  final Dataset<GenericRecord> ds = Datasets.create(datasetUri, plainDescriptor);

  Assert.assertEquals("Should work for empty relative directory",
      ds, FileSystemDatasets.viewForUri(ds, "file:/tmp/datasets/ns/test"));

  Runnable partitionPathLookup = new Runnable() {
    @Override
    public void run() {
      FileSystemDatasets.viewForUri(ds, "y=2014/m=03/d=14");
    }
  };
  TestHelpers.assertThrows("Should reject paths in a non-partitioned dataset",
      IllegalArgumentException.class, partitionPathLookup);
}
 
Example 4
Source File: CorrelateEvents.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the event-correlation task from the input dataset at {@code uri} to an
 * output dataset.
 *
 * <p>The output URI defaults to {@code dataset:hive?dataset=correlated_events} and
 * is overridden by the sole argument when exactly one is given. The input dataset
 * must already exist; the output dataset is created (Avro format,
 * {@code CorrelatedEvents} schema) only when it does not exist yet.
 *
 * @param args optional single element: the output dataset URI
 * @return always 0
 */
@Override
public int run(List<String> args) throws Exception {

  final String source = uri;
  final String target = (args.size() == 1)
      ? args.get(0)
      : "dataset:hive?dataset=correlated_events";

  Preconditions.checkState(Datasets.exists(source),
      "input dataset doesn't exists");

  boolean targetMissing = !Datasets.exists(target);
  if (targetMissing) {
    DatasetDescriptor correlatedDescriptor = new DatasetDescriptor.Builder()
        .format("avro")
        .schema(CorrelatedEvents.class)
        .build();
    Datasets.create(target, correlatedDescriptor);
  }

  new CorrelateEventsTask(source, target).run();

  return 0;
}
 
Example 5
Source File: TestFileSystemPartitionView.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Rebuilds the two fixture datasets before each test: one created from the plain
 * {@code TestRecord} descriptor and one whose descriptor adds a 4-bucket hash
 * partition on {@code id}. Both are then filled via {@code writeTestRecords}.
 */
@Before
public void createTestDatasets() {
  final String unpartitionedUri = "dataset:file:/tmp/datasets/unpartitioned";
  final String partitionedUri = "dataset:file:/tmp/datasets/partitioned";

  // remove whatever a previous run left behind
  Datasets.delete(unpartitionedUri);
  Datasets.delete(partitionedUri);

  DatasetDescriptor baseDescriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create(unpartitionedUri, baseDescriptor, TestRecord.class);

  // same descriptor, plus a hash partition strategy
  PartitionStrategy hashById = new PartitionStrategy.Builder()
      .hash("id", 4)
      .build();
  DatasetDescriptor hashedDescriptor = new DatasetDescriptor.Builder(baseDescriptor)
      .partitionStrategy(hashById)
      .build();
  partitioned = Datasets.create(partitionedUri, hashedDescriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
 
Example 6
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset from {@code dataset:hive:/ds} and asserts that the metastore
 * has a managed table {@code default.ds}, then deletes the dataset again.
 */
@Test
public void testCreateChangedAbsolutePathURIMissingNamespace() {
  // this used to be a relative external URI, but is now a managed URI
  final String managedUri = "dataset:hive:/ds";
  Datasets.create(managedUri, DESCRIPTOR);

  Table created = metastore.getTable("default", "ds");
  Assert.assertNotNull("Table should be found under default.ds", created);

  // include the storage location in the failure message to aid debugging
  String managedMessage =
      "Should create a managed table: " + created.getSd().getLocation();
  Assert.assertTrue(managedMessage, HiveAbstractMetadataProvider.isManaged(created));

  boolean deleted = Datasets.delete(managedUri);
  Assert.assertTrue(deleted);
}
 
Example 7
Source File: TestCompactCommandCluster.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the fixtures for each test: a generated CSV file of users plus an
 * unpartitioned and a partitioned dataset in the HDFS repository, both loaded
 * from that CSV.
 *
 * <p>Steps, in order: compute {@code repoUri} from the test DFS authority, delete
 * any existing unpartitioned dataset via the CLI, write the CSV (header plus
 * {@code numRecords} rows), derive an Avro schema from it, create the
 * unpartitioned dataset via the CLI and the partitioned one (2-bucket hash on
 * {@code id}) via {@code Datasets.create}, then import the CSV into both.
 *
 * @throws Exception if any file, CLI, or dataset operation fails
 */
@Before
public void createDatasets() throws Exception {
  repoUri = "hdfs://" + getDFS().getUri().getAuthority() + "/tmp/data";
  TestUtil.run("delete", unpartitioned, "-r", repoUri, "-d", "target/data");

  File csvFile = temp.newFile("users.csv");
  // NOTE(review): presumably removed so the writer below starts from a fresh file — confirm
  csvFile.delete();
  String csv = csvFile.toString();
  BufferedWriter writer = Files.newWriter(
      csvFile, CSVSchemaCommand.SCHEMA_CHARSET);

  // close the writer even when a write fails, so the file handle is not leaked
  try {
    writer.append("id,username,email\n");
    numRecords = 30;
    for (int i = 0; i < numRecords; i++) {
      writer.append(i + ",test" + i + ",test" + i + "@example.com\n");
    }
  } finally {
    writer.close();
  }

  TestUtil.run("-v", "csv-schema", csv, "-o", avsc, "--class", "User");
  TestUtil.run("create", unpartitioned, "-s", avsc,
      "-r", repoUri, "-d", "target/data");

  // the partitioned dataset is created through the API with an explicit schema
  URI dsUri = URIBuilder.build("repo:" + repoUri, "default", partitioned);
  Datasets.<Object, Dataset<Object>>create(dsUri, new DatasetDescriptor.Builder()
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 2)
          .build())
      .schema(SchemaBuilder.record("User").fields()
          .requiredLong("id")
          .optionalString("username")
          .optionalString("email")
          .endRecord())
      .build(), Object.class);


  TestUtil.run("csv-import", csv, unpartitioned, "-r", repoUri, "-d", "target/data");
  TestUtil.run("csv-import", csv, partitioned, "-r", repoUri, "-d", "target/data");
}
 
Example 8
Source File: CreateStagedDataset.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates two file-system datasets that share the {@code simple-log.avsc} schema:
 * a Parquet dataset partitioned by year, month and day of {@code timestamp}, and
 * an Avro staging dataset partitioned by day of {@code timestamp}.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 */
@Override
public int run(String[] args) throws Exception {
  // where the schema is stored
  URI schemaURI = URI.create("resource:simple-log.avsc");

  // create a Parquet dataset for long-term storage
  PartitionStrategy byYearMonthDay = new PartitionStrategy.Builder()
      .year("timestamp", "year")
      .month("timestamp", "month")
      .day("timestamp", "day")
      .build();
  DatasetDescriptor longTermDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.PARQUET)
      .schemaUri(schemaURI)
      .partitionStrategy(byYearMonthDay)
      .build();
  Datasets.create("dataset:file:/tmp/data/logs", longTermDescriptor, Record.class);

  // create an Avro dataset to temporarily hold data
  PartitionStrategy byDay = new PartitionStrategy.Builder()
      .day("timestamp", "day")
      .build();
  DatasetDescriptor stagingDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schemaUri(schemaURI)
      .partitionStrategy(byDay)
      .build();
  Datasets.create("dataset:file:/tmp/data/logs_staging", stagingDescriptor, Record.class);

  return 0;
}
 
Example 9
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset from a {@code dataset:hive:/ds} URI that carries an explicit
 * {@code location} query parameter and asserts that the metastore has an external
 * table {@code default.ds}, then deletes the dataset again.
 */
@Test
public void testCreateChangedAbsolutePathURIMissingNamespaceWithURILocation() {
  final String externalUri = "dataset:hive:/ds?location=file:/tmp/data/ns/ds";
  Datasets.create(externalUri, DESCRIPTOR);

  Table created = metastore.getTable("default", "ds");
  Assert.assertNotNull("Table should be found under default.ds", created);

  boolean external = HiveAbstractMetadataProvider.isExternal(created);
  Assert.assertTrue("Should create an external table", external);

  boolean deleted = Datasets.delete(externalUri);
  Assert.assertTrue(deleted);
}
 
Example 10
Source File: CreateHiveUserDatasetGeneric.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the Hive dataset {@code users} from the {@code user.avsc} schema and
 * writes 100 generic records to it, each with a sequential username, the current
 * time as creation date, and a randomly picked entry of {@code colors}.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 */
@Override
public int run(String[] args) throws Exception {
  // Describe the dataset with the Avro schema, then create it in Hive
  DatasetDescriptor userDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> userDataset = Datasets.create("dataset:hive?dataset=users",
      userDescriptor, Record.class);

  // Open a writer and emit the generated users; the writer is closed in all cases
  DatasetWriter<Record> userWriter = null;
  try {
    userWriter = userDataset.newWriter();
    Random random = new Random();
    GenericRecordBuilder recordBuilder =
        new GenericRecordBuilder(userDescriptor.getSchema());
    int index = 0;
    while (index < 100) {
      String username = "user-" + index;
      Record user = recordBuilder.set("username", username)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[random.nextInt(colors.length)]).build();
      userWriter.write(user);
      index++;
    }

  } finally {
    if (userWriter != null) {
      userWriter.close();
    }
  }

  return 0;
}
 
Example 11
Source File: CreateProductDatasetPojo.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the HDFS dataset {@code /tmp/data/products} from the {@code Product}
 * class schema and writes one product per entry of {@code names}, assigning ids
 * sequentially from 0.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 */
@Override
public int run(String[] args) throws Exception {

  // Describe the dataset using the schema derived from the Product class
  DatasetDescriptor productDescriptor = new DatasetDescriptor.Builder()
      .schema(Product.class)
      .build();
  Dataset<Product> productDataset = Datasets.create(
      "dataset:hdfs:/tmp/data/products", productDescriptor, Product.class);

  // Open a writer and emit the products; the writer is closed in all cases
  DatasetWriter<Product> productWriter = null;
  try {
    productWriter = productDataset.newWriter();
    int nextId = 0;
    for (String productName : names) {
      Product item = new Product();
      item.setName(productName);
      item.setId(nextId);
      nextId++;
      productWriter.write(item);
    }
  } finally {
    if (productWriter != null) {
      productWriter.close();
    }
  }

  return 0;
}
 
Example 12
Source File: TestKiteProcessorsCluster.java    From localization_nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Stores three user records into a Hive dataset through the StoreInKiteDataset
 * processor and asserts that reading the dataset back returns the same records.
 *
 * <p>The runner is expected to be invalid until the dataset URI property is set.
 */
@Test
public void testBasicStoreToHive() throws IOException {
    final String hiveUri = "dataset:hive:ns/test";

    Dataset<Record> target = Datasets.create(hiveUri, descriptor, Record.class);

    // the processor must not validate before its dataset URI is configured
    TestRunner testRunner = TestRunners.newTestRunner(StoreInKiteDataset.class);
    testRunner.assertNotValid();

    testRunner.setProperty(StoreInKiteDataset.KITE_DATASET_URI, hiveUri);
    testRunner.assertValid();

    // NOTE(review): these e-mail literals look like obfuscated placeholders — confirm against upstream
    List<Record> users = Lists.newArrayList();
    users.add(user("a", "[email protected]"));
    users.add(user("b", "[email protected]"));
    users.add(user("c", "[email protected]"));

    testRunner.enqueue(streamFor(users));
    testRunner.run();

    testRunner.assertAllFlowFilesTransferred("success", 1);

    // NOTE(review): the reader opened here is never closed — consider closing it
    List<Record> stored = Lists.newArrayList(
            (Iterable<Record>) target.newReader());
    Assert.assertEquals("Records should match", users, stored);

    Datasets.delete(hiveUri);
}
 
Example 13
Source File: CreateUserDatasetGeneric.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the HDFS dataset {@code /tmp/data/users} from the {@code user.avsc}
 * schema and writes 100 generic records to it, each with a sequential username,
 * the current time as creation date, and a randomly picked entry of {@code colors}.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 */
@Override
public int run(String[] args) throws Exception {
  // Describe the dataset with the Avro schema, then create it on HDFS
  DatasetDescriptor userDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> userDataset = Datasets.create(
      "dataset:hdfs:/tmp/data/users", userDescriptor, Record.class);

  // Open a writer and emit the generated users; the writer is closed in all cases
  DatasetWriter<Record> userWriter = null;
  try {
    userWriter = userDataset.newWriter();
    Random random = new Random();
    GenericRecordBuilder recordBuilder =
        new GenericRecordBuilder(userDescriptor.getSchema());
    int index = 0;
    while (index < 100) {
      String username = "user-" + index;
      Record user = recordBuilder.set("username", username)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[random.nextInt(colors.length)]).build();
      userWriter.write(user);
      index++;
    }
  } finally {
    if (userWriter != null) {
      userWriter.close();
    }
  }

  return 0;
}
 
Example 14
Source File: CreateUserDatasetGenericPartitioned.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the HDFS dataset {@code /tmp/data/users} from the {@code user.avsc}
 * schema, partitioned by the value of {@code favoriteColor}, and writes 100
 * generic records to it.
 *
 * <p>Each record gets a sequential username, the current time as creation date,
 * and a randomly picked entry of {@code colors}. The writer is closed whether or
 * not writing succeeds.
 *
 * @param args command-line arguments (not read by this method)
 * @return always 0
 */
@Override
public int run(String[] args) throws Exception {
  // Create a partition strategy that identity-partitions on favoriteColor,
  // exposing the partition field under the name favorite_color
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("favoriteColor", "favorite_color")
      .build();

  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(partitionStrategy)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    // writer stays null if newWriter() itself failed
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
 
Example 15
Source File: TestKiteStorageProcessor.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Before each test, creates a file-system dataset of {@code Record}s with the
 * shared user schema in a fresh {@code ns/temp} temp folder, and stores both its
 * URI and the dataset in the test's fields.
 */
@Before
public void createDataset() throws Exception {
    final DatasetDescriptor userDescriptor =
            new DatasetDescriptor.Builder().schema(TestUtil.USER_SCHEMA).build();

    final String folderPath = temp.newFolder("ns", "temp").toString();
    this.datasetUri = "dataset:file:" + folderPath;
    this.dataset = Datasets.create(this.datasetUri, userDescriptor, Record.class);
}
 
Example 16
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset from an absolute-path Hive URI and asserts that the metastore
 * has an external table {@code ns.ds}, then deletes the dataset again.
 */
@Test
public void testAbsolutePathURI() {
  // absolute-path form of the URI; the table is expected under ns.ds
  // NOTE(review): the path here has four components (tmp/data/ns/ds); confirm how
  // the URI pattern decides this is an absolute location
  final String absoluteUri = "dataset:hive:/tmp/data/ns/ds";
  Datasets.create(absoluteUri, DESCRIPTOR);

  Table created = metastore.getTable("ns", "ds");
  Assert.assertNotNull("Table should be found under ns.ds", created);

  boolean external = HiveAbstractMetadataProvider.isExternal(created);
  Assert.assertTrue("Should create an external table", external);

  boolean deleted = Datasets.delete(absoluteUri);
  Assert.assertTrue(deleted);
}
 
Example 17
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset from a {@code dataset:hive:/ns/ds} URI that carries an explicit
 * {@code location} query parameter and asserts that the metastore has an external
 * table {@code ns.ds}, then deletes the dataset again.
 */
@Test
public void testCreateChangedAbsolutePathURIWithURILocation() {
  final String externalUri = "dataset:hive:/ns/ds?location=file:/tmp/data/ns/ds";
  Datasets.create(externalUri, DESCRIPTOR);

  Table created = metastore.getTable("ns", "ds");
  Assert.assertNotNull("Table should be found under ns.ds", created);

  boolean external = HiveAbstractMetadataProvider.isExternal(created);
  Assert.assertTrue("Should create an external table", external);

  boolean deleted = Datasets.delete(externalUri);
  Assert.assertTrue(deleted);
}
 
Example 18
Source File: TestViewUris.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Once per test class, deletes and recreates the {@code /tmp/test_name} file-system
 * dataset from {@code SCHEMA} and {@code STRATEGY}, storing it in {@code test}.
 */
@BeforeClass
public static void createTestDataset() {
  final String testUri = "dataset:file:/tmp/test_name";

  // drop any dataset left over from an earlier run before creating a new one
  Datasets.delete(testUri);

  DatasetDescriptor partitionedDescriptor = new DatasetDescriptor.Builder()
      .schema(SCHEMA)
      .partitionStrategy(STRATEGY)
      .build();
  test = Datasets.create(testUri, partitionedDescriptor);
}
 
Example 19
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset from a relative-path Hive URI and asserts that the metastore
 * has an external table {@code ns.ds}, then deletes the dataset again.
 */
@Test
public void testRelativePathURI() {
  // recognized as a deprecated form because there are 3 path components
  final String relativeUri = "dataset:hive:data/ns/ds";
  Datasets.create(relativeUri, DESCRIPTOR);

  Table created = metastore.getTable("ns", "ds");
  Assert.assertNotNull("Table should be found under ns.ds", created);

  boolean external = HiveAbstractMetadataProvider.isExternal(created);
  Assert.assertTrue("Should create an external table", external);

  boolean deleted = Datasets.delete(relativeUri);
  Assert.assertTrue(deleted);
}
 
Example 20
Source File: TestS3Dataset.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Round-trips three strings through a dataset stored under the {@code s3a} scheme:
 * creates the dataset with a string schema, writes the values, reads them back and
 * asserts they match, then deletes the dataset.
 *
 * <p>Skipped via {@code Assume} when {@code ID} is null or empty.
 */
@Test
public void testBasics3a() {
  // only run this test if credentials are present
  boolean haveCredentials = ID != null && !ID.isEmpty();
  Assume.assumeTrue(haveCredentials);

  final String s3aUri = "dataset:s3a://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(s3aUri);

  DatasetDescriptor stringDescriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  Dataset<String> strings = Datasets.create(s3aUri, stringDescriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");

  // write every expected value; the writer is closed in all cases
  DatasetWriter<String> out = null;
  try {
    out = strings.newWriter();
    for (String value : expected) {
      out.write(value);
    }
  } finally {
    if (out != null) {
      out.close();
    }
  }

  // read everything back and compare; the reader is closed in all cases
  DatasetReader<String> in = null;
  try {
    in = strings.newReader();
    List<String> readBack = Lists.newArrayList((Iterator<String>) in);
    Assert.assertEquals("Should match written strings", expected, readBack);
  } finally {
    if (in != null) {
      in.close();
    }
  }

  // clean up
  Datasets.delete(s3aUri);
}