Java Code Examples for org.kitesdk.data.Dataset#newReader()
The following examples show how to use org.kitesdk.data.Dataset#newReader(). Each example lists the source file it was taken from, the originating project, and that project's license.
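Most of the examples follow the same pattern: load a Dataset with Datasets.load(), open a DatasetReader with newReader(), iterate over the records, and close the reader in a finally block. Below is a minimal sketch of that pattern; the class name, dataset URI, and use of GenericRecord are placeholders for illustration, not code from any of the listed projects.

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;

public class ReadExample {
  public static void main(String[] args) {
    // Load an existing dataset (placeholder URI) as generic Avro records
    Dataset<GenericRecord> dataset = Datasets.load(
        "dataset:hdfs:/tmp/data/example", GenericRecord.class);

    // Open a reader, iterate over all records, and always close the reader
    DatasetReader<GenericRecord> reader = null;
    try {
      reader = dataset.newReader();
      for (GenericRecord record : reader) {
        System.out.println(record);
      }
    } finally {
      if (reader != null) {
        reader.close();
      }
    }
  }
}

Since DatasetReader is Closeable, a reader opened this way can also be managed with try-with-resources on Java 7 and later instead of an explicit finally block.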
Example 1
Source File: ReadUserDatasetGeneric.java from kite-examples (Apache License 2.0)
@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  Dataset<Record> users = Datasets.load(
      "dataset:hdfs:/tmp/data/users", Record.class);

  // Get a reader for the dataset and read all the users
  DatasetReader<Record> reader = null;
  try {
    reader = users.newReader();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
Example 2
Source File: ReadHiveUserDatasetGeneric.java from kite-examples (Apache License 2.0)
@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  Dataset<Record> users = Datasets.load(
      "dataset:hive?dataset=users", Record.class);

  // Get a reader for the dataset and read all the users
  DatasetReader<Record> reader = null;
  try {
    reader = users.newReader();
    // Iterate over the reader opened above so it is the one closed in finally
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
Example 3
Source File: ReadProductDatasetPojo.java from kite-examples (Apache License 2.0)
@Override
public int run(String[] args) throws Exception {
  // Load the products dataset
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  // Get a reader for the dataset and read all the products
  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    for (Product product : reader) {
      System.out.println(product);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
Example 4
Source File: ReadDataset.java from kite-examples (Apache License 2.0)
@Override
public int run(String[] args) throws Exception {
  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
Example 5
Source File: ReadMovies.java from kite-examples (Apache License 2.0)
@Override
public int run(String[] args) throws Exception {
  Dataset<Record> movies = Datasets.load(
      "dataset:hdfs:/tmp/data/movies", Record.class);

  DatasetReader<Record> reader = null;
  try {
    reader = movies.newReader();
    for (Record rec : reader) {
      System.err.println("Movie: " + rec);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
Example 6
Source File: TestHiveImport.java from aliyun-maxcompute-data-collectors (Apache License 2.0)
private void verifyHiveDataset(String tableName, Object[][] valsArray) {
  String datasetUri = String.format("dataset:hive:default/%s",
      tableName.toLowerCase());
  assertTrue(Datasets.exists(datasetUri));

  Dataset dataset = Datasets.load(datasetUri);
  assertFalse(dataset.isEmpty());

  DatasetReader<GenericRecord> reader = dataset.newReader();
  try {
    List<String> expectations = new ArrayList<String>();
    if (valsArray != null) {
      for (Object[] vals : valsArray) {
        expectations.add(Arrays.toString(vals));
      }
    }

    while (reader.hasNext() && expectations.size() > 0) {
      String actual = Arrays.toString(
          convertGenericRecordToArray(reader.next()));
      assertTrue("Expect record: " + actual, expectations.remove(actual));
    }
    assertFalse(reader.hasNext());
    assertEquals(0, expectations.size());
  } finally {
    reader.close();
  }
}
Example 7
Source File: TestAllTables.java from aliyun-maxcompute-data-collectors (Apache License 2.0)
public void testMultiTableImportAsParquetFormat() throws IOException {
  String[] argv = getArgv(new String[]{"--as-parquetfile"}, null);
  runImport(new ImportAllTablesTool(), argv);

  Path warehousePath = new Path(this.getWarehouseDir());
  int i = 0;
  for (String tableName : this.tableNames) {
    Path tablePath = new Path(warehousePath, tableName);
    Dataset dataset = Datasets.load("dataset:file:" + tablePath);

    // dequeue the expected value for this table. This
    // list has the same order as the tableNames list.
    String expectedVal = Integer.toString(i++) + ","
        + this.expectedStrings.get(0);
    this.expectedStrings.remove(0);

    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      GenericRecord record = reader.next();
      String line = record.get(0) + "," + record.get(1);
      assertEquals("Table " + tableName + " expected a different string",
          expectedVal, line);
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }
}
Example 8
Source File: TestCrunchDatasetsHBase.java from kite (Apache License 2.0)
private void checkRecords(Dataset<GenericRecord> dataset, int count, int start) {
  int cnt = start;
  DatasetReader<GenericRecord> reader = dataset.newReader();
  try {
    for (GenericRecord entity : reader) {
      HBaseDatasetRepositoryTest.compareEntitiesWithUtf8(cnt, entity);
      cnt++;
    }
    assertEquals(count, cnt - start);
  } finally {
    reader.close();
  }
}
Example 9
Source File: TestSpark.java from kite (Apache License 2.0)
@Test
@SuppressWarnings("deprecation")
public void testSparkJob() throws Exception {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .property("kite.allow.csv", "true")
          .schema(TestMapReduce.STRING_SCHEMA)
          .format(format)
          .build(), Record.class);
  DatasetWriter<Record> writer = inputDataset.newWriter();
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("carrot"));
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("apple"));
  writer.close();

  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .property("kite.allow.csv", "true")
          .schema(TestMapReduce.STATS_SCHEMA)
          .format(format)
          .build(), Record.class);

  Job job = Job.getInstance();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset);

  @SuppressWarnings("unchecked")
  JavaPairRDD<Record, Void> inputData = SparkTestHelper.getSparkContext()
      .newAPIHadoopRDD(job.getConfiguration(), DatasetKeyInputFormat.class,
          Record.class, Void.class);

  JavaPairRDD<String, Integer> mappedData = inputData.mapToPair(new ToJava());
  JavaPairRDD<String, Integer> sums = mappedData.reduceByKey(new Sum());
  JavaPairRDD<Record, Void> outputData = sums.mapToPair(new ToAvro());

  outputData.saveAsNewAPIHadoopDataset(job.getConfiguration());

  DatasetReader<Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  for (Record record : reader) {
    counts.put(record.get("name").toString(), (Integer) record.get("count"));
  }
  reader.close();

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());
}
Example 10
Source File: TestCrunchDatasets.java from kite (Apache License 2.0)
@Test
public void testUseReaderSchema() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(
      CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(
      new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {
    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
Example 11
Source File: TestCrunchDatasets.java from kite (Apache License 2.0)
@Test
public void testUseReaderSchemaParquet() throws IOException {
  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(
      CrunchDatasets.asSource(in.getUri(), NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(
      new UserRecordIdentityFn(), Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {
    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
Example 12
Source File: TestS3Dataset.java from kite (Apache License 2.0)
@Test
public void testBasics3n() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3n://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}
Example 13
Source File: TestS3Dataset.java from kite (Apache License 2.0)
@Test
public void testBasics3a() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3a://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();
  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}