Java Code Examples for org.kitesdk.data.Dataset#newWriter()
The following examples show how to use org.kitesdk.data.Dataset#newWriter().
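All of the examples share one pattern: obtain a DatasetWriter from the Dataset with newWriter(), write one or more records, and close the writer in a finally block so buffered records are flushed and the underlying files are committed. The sketch below shows that pattern in isolation; the dataset URI and the user.avsc schema (assumed here to declare a single required username string field) are placeholders for illustration, not taken from any one example.

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;

public class NewWriterSketch {

  public static void main(String[] args) {
    // Placeholder URI and schema: user.avsc is assumed to declare a single
    // required "username" string field.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaUri("resource:user.avsc")
        .build();
    Dataset<GenericRecord> users = Datasets.create(
        "dataset:file:/tmp/data/users", descriptor, GenericRecord.class);

    // newWriter() opens a writer for the dataset; close it so buffered
    // records are flushed and the underlying files are committed.
    DatasetWriter<GenericRecord> writer = null;
    try {
      writer = users.newWriter();
      GenericRecord record = new GenericRecordBuilder(descriptor.getSchema())
          .set("username", "example-user")
          .build();
      writer.write(record);
    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }
}

Because DatasetWriter implements java.io.Closeable (several tests below hand the writer to Guava's Closeables.close), a Java 7 try-with-resources statement is an equally valid way to guarantee the close.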
Example 1
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example 2
Source File: TestWriteReflectReadGeneric.java From kite with Apache License 2.0
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo = new FileSystemDatasetRepository(
      fs.getConf(), testDirectory);
  Dataset<MyRecord> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(MyRecord.class)
          .build(), MyRecord.class);
  DatasetWriter<MyRecord> writer = writerDataset.newWriter();
  for (int i = 0; i < totalRecords; i++) {
    writer.write(new MyRecord(String.valueOf(i), i));
  }
  writer.close();

  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
Example 3
Source File: TestReadCustomGeneric.java From kite with Apache License 2.0
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo = new FileSystemDatasetRepository(
      fs.getConf(), testDirectory);
  Dataset<MyRecord> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(MyRecord.class)
          .build(), MyRecord.class);
  DatasetWriter<MyRecord> writer = writerDataset.newWriter();
  for (int i = 0; i < totalRecords; i++) {
    writer.write(new MyRecord(String.valueOf(i), i));
  }
  writer.close();

  readerDataset = repo.load("ns", "test", TestGenericRecord.class);
}
Example 4
Source File: CreateUserDatasetGeneric.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 5
Source File: CreateUserDatasetGenericParquet.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 6
Source File: CreateProductDatasetPojo.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of products with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(Product.class)
      .build();
  Dataset<Product> products = Datasets.create(
      "dataset:hdfs:/tmp/data/products", descriptor, Product.class);

  // Get a writer for the dataset and write some products to it
  DatasetWriter<Product> writer = null;
  try {
    writer = products.newWriter();
    int i = 0;
    for (String name : names) {
      Product product = new Product();
      product.setName(name);
      product.setId(i++);
      writer.write(product);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 7
Source File: CreateHiveUserDatasetGeneric.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create("dataset:hive?dataset=users",
      descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 8
Source File: CreateUserDatasetGenericPartitioned.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a partition strategy that partitions records by their
  // favoriteColor field, exposed as the "favorite_color" partition
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("favoriteColor", "favorite_color")
      .build();

  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(partitionStrategy)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
Example 9
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0
private void writeRecords(Dataset<GenericRecord> dataset, int count) {
  DatasetWriter<GenericRecord> writer = dataset.newWriter();
  try {
    for (int i = 0; i < count; ++i) {
      GenericRecord entity = HBaseDatasetRepositoryTest.createGenericEntity(i);
      writer.write(entity);
    }
  } finally {
    writer.close();
  }
}
Example 10
Source File: TestSimpleView.java From kite with Apache License 2.0
@Test
public void testRefineIdentity() throws Exception {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .identity("user_id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  // Create a separate dataset to avoid conflicts with the above.
  Dataset<StandardEvent> identityDataset = repo.create(
      "ns", "test_identity", descriptor);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = identityDataset.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  assertContentEquals(Sets.newHashSet(sepEvent, novEvent),
      identityDataset.with("user_id", 0L));
}
Example 11
Source File: TestProjection.java From kite with Apache License 2.0
@Test
public void testReflectProjectionLoad() throws IOException {
  Dataset<ReflectStandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(ReflectStandardEvent.class)
          .build(),
      ReflectStandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  View<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example 12
Source File: TestHiveExternalDatasetRepository.java From kite with Apache License 2.0
@SuppressWarnings("deprecation") @Test public void testDeletedPartitionRemovedFromHive() throws Exception { final String NAME2 = "test2"; // use a multi-item partition strategy to ensure the system // can convert it to the corresponding Hive partition PartitionStrategy partitionStrategy = new PartitionStrategy.Builder() .identity("username") .identity("email").build(); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(testSchema) .partitionStrategy(partitionStrategy) .build(); Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor); HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2); HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2); Assert.assertTrue("No partitions yet", client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty()); GenericData.Record record1 = new GenericRecordBuilder( dataset.getDescriptor().getSchema()) .set("username", "0").set("email", "0").build(); GenericData.Record record2 = new GenericRecordBuilder( dataset.getDescriptor().getSchema()) .set("username", "1").set("email", "1").build(); DatasetWriter<GenericRecord> writer = dataset.newWriter(); try { writer.write(record1); writer.write(record2); } finally { writer.close(); } Assert.assertEquals("Should be two partitions", 2, client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size()); RefinableView view = dataset.with("username", "0").with("email", "0"); view.deleteAll(); Assert.assertEquals("Should be one partition", 1, client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size()); view = dataset.with("username", "1").with("email", "1"); view.deleteAll(); Assert.assertEquals("Should be no partitions", 0, client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size()); }
Example 13
Source File: TestS3Dataset.java From kite with Apache License 2.0
@Test
public void testBasics3a() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3a://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}
Example 14
Source File: TestS3Dataset.java From kite with Apache License 2.0
@Test
public void testBasics3n() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3n://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}
Example 15
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testUseReaderSchemaParquet() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");
  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from the updated dataset that has the new schema.
  // At this point, the User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(
      CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(
      new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader<Record> reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {
    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
Example 16
Source File: TestCrunchDatasets.java From kite with Apache License 2.0
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");
  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from the updated dataset that has the new schema.
  // At this point, the User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(
      CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(
      new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader<Record> reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {
    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
Example 17
Source File: TestSpark.java From kite with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testSparkJob() throws Exception {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .property("kite.allow.csv", "true")
          .schema(TestMapReduce.STRING_SCHEMA)
          .format(format)
          .build(), Record.class);
  DatasetWriter<Record> writer = inputDataset.newWriter();
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("carrot"));
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("apple"));
  writer.close();

  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .property("kite.allow.csv", "true")
          .schema(TestMapReduce.STATS_SCHEMA)
          .format(format)
          .build(), Record.class);

  Job job = Job.getInstance();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset);

  @SuppressWarnings("unchecked")
  JavaPairRDD<Record, Void> inputData = SparkTestHelper.getSparkContext()
      .newAPIHadoopRDD(job.getConfiguration(), DatasetKeyInputFormat.class,
          Record.class, Void.class);

  JavaPairRDD<String, Integer> mappedData = inputData.mapToPair(new ToJava());
  JavaPairRDD<String, Integer> sums = mappedData.reduceByKey(new Sum());
  JavaPairRDD<Record, Void> outputData = sums.mapToPair(new ToAvro());

  outputData.saveAsNewAPIHadoopDataset(job.getConfiguration());

  DatasetReader<Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  for (Record record : reader) {
    counts.put(record.get("name").toString(),
        (Integer) record.get("count"));
  }
  reader.close();

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());
}
Example 18
Source File: GenerateSimpleLogs.java From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // going to generate a lot of random log messages
  final Random rand = new Random();

  // data is written to the staging dataset
  Dataset<Record> staging = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);

  // this is going to build our simple log records
  GenericRecordBuilder builder = new GenericRecordBuilder(
      staging.getDescriptor().getSchema());

  // generate timestamps starting 1 day ago
  final Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
  final long yesterday = now.getTimeInMillis() - DAY_IN_MILLIS;

  DatasetWriter<Record> writer = null;
  try {
    writer = staging.newWriter();

    // generate 15,000 messages, each 5 seconds apart, starting 24 hours ago
    // this is a little less than 24 hours' worth of messages
    for (int second : Ranges.closed(0, 15000).asSet(DiscreteDomains.integers())) {
      LOG.info("Generating log message " + second);

      builder.set("timestamp", yesterday + second * 5000);
      builder.set("component", "GenerateSimpleLogs");

      int level = rand.nextInt(LOG_LEVELS.length);
      builder.set("level", LOG_LEVELS[level]);
      builder.set("message", LOG_MESSAGES[level]);

      writer.write(builder.build());
    }

    if (writer instanceof Flushable) {
      ((Flushable) writer).flush();
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}