Java Code Examples for org.kitesdk.data.Dataset#newReader()

The following examples show how to use org.kitesdk.data.Dataset#newReader(). Each snippet is taken from an open-source project; the source file and license are noted above each example.
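Before the project examples, here is a minimal sketch of the core pattern, assuming the same imports the examples use (org.kitesdk.data.Datasets, org.kitesdk.data.Dataset, org.kitesdk.data.DatasetReader, and Avro's GenericRecord) and a hypothetical dataset URI. Because DatasetReader implements java.io.Closeable, a Java 7+ try-with-resources block can replace the explicit try/finally seen in several examples below:

// Load a dataset by URI (the URI here is hypothetical) and read every record.
// newReader() returns a DatasetReader, which is both Iterable and Closeable,
// so try-with-resources closes the reader even if iteration throws.
Dataset<GenericRecord> dataset = Datasets.load("dataset:hdfs:/tmp/data/example");
try (DatasetReader<GenericRecord> reader = dataset.newReader()) {
  for (GenericRecord record : reader) {
    System.out.println(record);
  }
}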
Example 1
Source File: ReadUserDatasetGeneric.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  Dataset<Record> users = Datasets.load(
      "dataset:hdfs:/tmp/data/users", Record.class);

  // Get a reader for the dataset and read all the users
  DatasetReader<Record> reader = null;
  try {
    reader = users.newReader();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example 2
Source File: ReadHiveUserDatasetGeneric.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  Dataset<Record> users = Datasets.load(
      "dataset:hive?dataset=users", Record.class);

  // Get a reader for the dataset and read all the users
  DatasetReader<Record> reader = null;
  try {
    reader = users.newReader();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example 3
Source File: ReadProductDatasetPojo.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the products dataset
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  // Get a reader for the dataset and read all the products
  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    for (Product product : reader) {
      System.out.println(product);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example 4
Source File: ReadDataset.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
 
Example 5
Source File: ReadMovies.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  Dataset<Record> movies = Datasets.load(
      "dataset:hdfs:/tmp/data/movies", Record.class);

  DatasetReader<Record> reader = null;
  try {
    reader = movies.newReader();
    for (Record rec : reader) {
      System.err.println("Movie: " + rec);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example 6
Source File: TestHiveImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private void verifyHiveDataset(String tableName, Object[][] valsArray) {
  String datasetUri = String.format("dataset:hive:default/%s",
      tableName.toLowerCase());
  assertTrue(Datasets.exists(datasetUri));
  Dataset dataset = Datasets.load(datasetUri);
  assertFalse(dataset.isEmpty());

  DatasetReader<GenericRecord> reader = dataset.newReader();
  try {
    List<String> expectations = new ArrayList<String>();
    if (valsArray != null) {
      for (Object[] vals : valsArray) {
        expectations.add(Arrays.toString(vals));
      }
    }

    while (reader.hasNext() && expectations.size() > 0) {
      String actual = Arrays.toString(
          convertGenericRecordToArray(reader.next()));
      assertTrue("Expect record: " + actual, expectations.remove(actual));
    }
    assertFalse(reader.hasNext());
    assertEquals(0, expectations.size());
  } finally {
    reader.close();
  }
}
 
Example 7
Source File: TestAllTables.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testMultiTableImportAsParquetFormat() throws IOException {
  String [] argv = getArgv(new String[]{"--as-parquetfile"}, null);
  runImport(new ImportAllTablesTool(), argv);

  Path warehousePath = new Path(this.getWarehouseDir());
  int i = 0;
  for (String tableName : this.tableNames) {
    Path tablePath = new Path(warehousePath, tableName);
    Dataset dataset = Datasets.load("dataset:file:" + tablePath);

    // dequeue the expected value for this table. This
    // list has the same order as the tableNames list.
    String expectedVal = Integer.toString(i++) + ","
        + this.expectedStrings.get(0);
    this.expectedStrings.remove(0);

    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      GenericRecord record = reader.next();
      String line = record.get(0) + "," + record.get(1);
      assertEquals("Table " + tableName + " expected a different string",
          expectedVal, line);
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }
}
 
Example 8
Source File: TestCrunchDatasetsHBase.java    From kite with Apache License 2.0
private void checkRecords(Dataset<GenericRecord> dataset, int count, int start) {
  int cnt = start;
  DatasetReader<GenericRecord> reader = dataset.newReader();
  try {
    for (GenericRecord entity : reader) {
      HBaseDatasetRepositoryTest.compareEntitiesWithUtf8(cnt, entity);
      cnt++;
    }
    assertEquals(count, cnt - start);
  } finally {
    reader.close();
  }
}
 
Example 9
Source File: TestSpark.java    From kite with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testSparkJob() throws Exception {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
        .property("kite.allow.csv", "true")
        .schema(TestMapReduce.STRING_SCHEMA)
        .format(format)
        .build(), Record.class);
  DatasetWriter<Record> writer = inputDataset.newWriter();
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("carrot"));
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("apple"));
  writer.close();


  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
        .property("kite.allow.csv", "true")
        .schema(TestMapReduce.STATS_SCHEMA)
        .format(format)
        .build(), Record.class);

  Job job = Job.getInstance();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset);

  @SuppressWarnings("unchecked")
  JavaPairRDD<Record, Void> inputData = SparkTestHelper.getSparkContext()
      .newAPIHadoopRDD(job.getConfiguration(), DatasetKeyInputFormat.class,
          Record.class, Void.class);

  JavaPairRDD<String, Integer> mappedData = inputData.mapToPair(new ToJava());
  JavaPairRDD<String, Integer> sums = mappedData.reduceByKey(new Sum());
  JavaPairRDD<Record, Void> outputData = sums.mapToPair(new ToAvro());

  outputData.saveAsNewAPIHadoopDataset(job.getConfiguration());

  DatasetReader<Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  for (Record record : reader) {
    counts.put(record.get("name").toString(), (Integer) record.get("count"));
  }
  reader.close();

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());

}
 
Example 10
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testUseReaderSchema() throws IOException {

  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();

  try {

    writer.write(oldUser);

  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
      NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
      Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {

    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());

  } finally {
    reader.close();
  }
}
 
Example 11
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testUseReaderSchemaParquet() throws IOException {

  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());

  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();

  try {

    writer.write(oldUser);

  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
      NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
      Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {

    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());

  } finally {
    reader.close();
  }
}
 
Example 12
Source File: TestS3Dataset.java    From kite with Apache License 2.0
@Test
public void testBasics3n() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3n://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();

  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}
 
Example 13
Source File: TestS3Dataset.java    From kite with Apache License 2.0
@Test
public void testBasics3a() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3a://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();

  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}