org.kitesdk.data.DatasetReader Java Examples
The following examples show how to use
org.kitesdk.data.DatasetReader.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DatasetTestUtilities.java From kite with Apache License 2.0 | 6 votes |
public static <R> void checkReaderBehavior( DatasetReader<R> reader, int totalRecords, RecordValidator<R> validator) { // this is now used for both initialized and not initialized records because // initialization now happens automatically in newReader if (!reader.isOpen() && reader instanceof InitializeAccessor) { ((InitializeAccessor) reader).initialize(); } try { Assert.assertTrue("Reader should be open", reader.isOpen()); checkReaderIteration(reader, totalRecords, validator); } finally { reader.close(); } Assert.assertFalse("Reader is open after close()", reader.isOpen()); }
Example #2
Source File: ReadDataset.java From kite-examples with Apache License 2.0 | 6 votes |
@Override public int run(String[] args) throws Exception { // Load the events dataset Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events"); // Get a reader for the dataset and read all the events DatasetReader<GenericRecord> reader = events.newReader(); try { for (GenericRecord event : reader) { System.out.println(event); } } finally { reader.close(); } return 0; }
Example #3
Source File: ReadMovies.java From kite-examples with Apache License 2.0 | 6 votes |
/**
 * Loads the movies dataset from HDFS and prints each record to stderr.
 *
 * @param args command-line arguments (not used)
 * @return 0 once all movies have been printed
 */
@Override
public int run(String[] args) throws Exception {
  final Dataset<Record> movieDataset =
      Datasets.load("dataset:hdfs:/tmp/data/movies", Record.class);

  DatasetReader<Record> movieReader = null;
  try {
    movieReader = movieDataset.newReader();
    for (Record movie : movieReader) {
      System.err.println("Movie: " + movie);
    }
  } finally {
    // The reader is only non-null when newReader() succeeded.
    if (movieReader != null) {
      movieReader.close();
    }
  }

  return 0;
}
Example #4
Source File: ReadDataset.java From kite-examples with Apache License 2.0 | 6 votes |
@Override public int run(String[] args) throws Exception { // Load the events dataset Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events"); // Get a reader for the dataset and read all the events DatasetReader<GenericRecord> reader = events.newReader(); try { for (GenericRecord event : reader) { System.out.println(event); } } finally { reader.close(); } return 0; }
Example #5
Source File: ReadProductDatasetPojo.java From kite-examples with Apache License 2.0 | 6 votes |
@Override public int run(String[] args) throws Exception { // Load the products dataset Dataset<Product> products = Datasets.load( "dataset:hdfs:/tmp/data/products", Product.class); // Get a reader for the dataset and read all the users DatasetReader<Product> reader = null; try { reader = products.newReader(); for (Product product : reader) { System.out.println(product); } } finally { if (reader != null) { reader.close(); } } return 0; }
Example #6
Source File: ReadUserDatasetGenericOnePartition.java From kite-examples with Apache License 2.0 | 6 votes |
@Override public int run(String[] args) throws Exception { // Load the users dataset Dataset<Record> users = Datasets.load( "dataset:hdfs:/tmp/data/users", Record.class); // Get a reader for the dataset and read all the users DatasetReader<Record> reader = null; try { reader = users.with("favoriteColor", "green").newReader(); for (GenericRecord user : reader) { System.out.println(user); } } finally { if (reader != null) { reader.close(); } } return 0; }
Example #7
Source File: ReadHiveUserDatasetGeneric.java From kite-examples with Apache License 2.0 | 6 votes |
@Override public int run(String[] args) throws Exception { // Load the users dataset Dataset<Record> users = Datasets.load( "dataset:hive?dataset=users", Record.class); // Get a reader for the dataset and read all the users DatasetReader<Record> reader = null; try { reader = users.newReader(); for (GenericRecord user : users.newReader()) { System.out.println(user); } } finally { if (reader != null) { reader.close(); } } return 0; }
Example #8
Source File: ReadUserDatasetGeneric.java From kite-examples with Apache License 2.0 | 6 votes |
@Override public int run(String[] args) throws Exception { // Load the users dataset Dataset<Record> users = Datasets.load( "dataset:hdfs:/tmp/data/users", Record.class); // Get a reader for the dataset and read all the users DatasetReader<Record> reader = null; try { reader = users.newReader(); for (GenericRecord user : reader) { System.out.println(user); } } finally { if (reader != null) { reader.close(); } } return 0; }
Example #9
Source File: TestMapReduce.java From kite with Apache License 2.0 | 6 votes |
/**
 * Reads the output dataset into a name-to-count map and verifies the
 * expected counts.
 *
 * @param existingPresent when {@code true}, a "date" entry with count 4 is
 *                        expected; otherwise no "date" entry may be present
 */
private void checkOutput(boolean existingPresent) {
  Map<String, Integer> counts = new HashMap<String, Integer>();

  DatasetReader<GenericData.Record> reader = outputDataset.newReader();
  try {
    for (GenericData.Record record : reader) {
      counts.put(record.get("name").toString(), (Integer) record.get("count"));
    }
  } finally {
    // Close even if a record is malformed and the loop body throws.
    reader.close();
  }

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());
  if (existingPresent) {
    Assert.assertEquals(4, counts.get("date").intValue());
  } else {
    Assert.assertNull(counts.get("date"));
  }
}
Example #10
Source File: UserProfileDatasetExample.java From kite with Apache License 2.0 | 6 votes |
/** * Print the user profiles and actions for all users with the provided last * name * * This method demonstrates how to open a scanner with a start key. It's using * the composite dao, so the records it returns will be a composite of both * the profile model and actions model. * * @param lastName * The last name of users to scan. */ public void printUserProfileActionsForLastName(String lastName) { // TODO: use a reader with a start key DatasetReader<UserProfileActionsModel2> reader = userProfileActionsDataset.newReader(); try { for (UserProfileActionsModel2 entity : reader) { UserProfileModel2 userProfile = entity.getUserProfileModel(); if (userProfile.getLastName().equals(lastName)) { System.out.println(entity.toString()); } } } finally { // readers need to be closed. reader.close(); } }
Example #11
Source File: TestParquetImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Runs an import followed by an {@code --append} import of the same
 * single-row table and checks that the output holds exactly two records,
 * each with {@code DATA_COL0 == 1}.
 */
public void testIncrementalParquetImport() throws IOException, SQLException {
  String[] columnTypes = { "INT" };
  String[] columnValues = { "1" };
  createTableWithColTypes(columnTypes, columnValues);

  // First a plain import, then a second one in append mode.
  runImport(getOutputArgv(true, null));
  runImport(getOutputArgv(true, new String[]{"--append"}));

  DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());

    GenericRecord firstRow = imported.next();
    assertEquals(1, firstRow.get("DATA_COL0"));

    GenericRecord secondRow = imported.next();
    assertEquals(1, secondRow.get("DATA_COL0"));

    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
Example #12
Source File: TestParquetImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Imports a single-row table using the query-based arguments and checks that
 * exactly one record with {@code DATA_COL0 == 1} is produced.
 */
public void testQueryImport() throws IOException, SQLException {
  String[] columnTypes = { "INT" };
  String[] columnValues = { "1" };
  createTableWithColTypes(columnTypes, columnValues);

  runImport(getOutputQueryArgv(true, null));

  DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());

    GenericRecord onlyRow = imported.next();
    assertEquals(1, onlyRow.get("DATA_COL0"));

    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
Example #13
Source File: TestParquetImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Imports a single-row table whose only column value is null and checks that
 * the one resulting record carries a null {@code DATA_COL0}.
 */
public void testNullableParquetImport() throws IOException, SQLException {
  String[] columnTypes = { "INT" };
  String[] columnValues = { null };
  createTableWithColTypes(columnTypes, columnValues);

  runImport(getOutputArgv(true, null));

  DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());

    GenericRecord onlyRow = imported.next();
    assertNull(onlyRow.get("DATA_COL0"));

    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
Example #14
Source File: TestParquetImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Imports a table whose column name contains non-identifier characters and
 * checks that both the schema field and the record field are exposed under
 * the name {@code TEST_P_A_R_QUET}.
 */
public void testNonIdentCharactersInColumnName() throws IOException {
  String[] columnNames = { "test_p-a+r/quet" };
  String[] columnTypes = { "INT" };
  String[] columnValues = { "2015" };
  createTableWithColTypesAndNames(columnNames, columnTypes, columnValues);

  runImport(getOutputArgv(true, null));

  // Schema-level check: a record schema with one INT field under the cleaned name.
  Schema importedSchema = getSchema();
  assertEquals(Type.RECORD, importedSchema.getType());
  List<Field> schemaFields = importedSchema.getFields();
  assertEquals(columnTypes.length, schemaFields.size());
  checkField(schemaFields.get(0), "TEST_P_A_R_QUET", Type.INT);

  // Data-level check: exactly one record holding the inserted value.
  DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());

    GenericRecord onlyRow = imported.next();
    assertEquals("TEST_P_A_R_QUET", 2015, onlyRow.get("TEST_P_A_R_QUET"));

    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
Example #15
Source File: TestParquetImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Imports a table whose column name starts with an underscore and checks
 * that both the schema field and the record field are exposed under the
 * name {@code __NAME}.
 */
public void testFirstUnderscoreInColumnName() throws IOException {
  String[] columnNames = { "_NAME" };
  String[] columnTypes = { "INT" };
  String[] columnValues = { "1987" };
  createTableWithColTypesAndNames(columnNames, columnTypes, columnValues);

  runImport(getOutputArgv(true, null));

  // Schema-level check: a record schema with one INT field named __NAME.
  Schema importedSchema = getSchema();
  assertEquals(Type.RECORD, importedSchema.getType());
  List<Field> schemaFields = importedSchema.getFields();
  assertEquals(columnTypes.length, schemaFields.size());
  checkField(schemaFields.get(0), "__NAME", Type.INT);

  // Data-level check: exactly one record holding the inserted value.
  DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());

    GenericRecord onlyRow = imported.next();
    assertEquals("__NAME", 1987, onlyRow.get("__NAME"));

    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
Example #16
Source File: TestParquetImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 6 votes |
/**
 * Imports an INT column with {@code --map-column-java DATA_COL0=String} and
 * checks that the schema field is a STRING and the record value is
 * {@code "10"}.
 */
public void testOverrideTypeMapping() throws IOException {
  String[] columnTypes = { "INT" };
  String[] columnValues = { "10" };
  createTableWithColTypes(columnTypes, columnValues);

  // Override the Java type mapping of the only column.
  String[] mappingArgs = { "--map-column-java", "DATA_COL0=String"};
  runImport(getOutputArgv(true, mappingArgs));

  // Schema-level check: a record schema with one STRING field.
  Schema importedSchema = getSchema();
  assertEquals(Type.RECORD, importedSchema.getType());
  List<Field> schemaFields = importedSchema.getFields();
  assertEquals(columnTypes.length, schemaFields.size());
  checkField(schemaFields.get(0), "DATA_COL0", Type.STRING);

  // Data-level check: exactly one record holding the value as a string.
  DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());

    GenericRecord onlyRow = imported.next();
    assertEquals("DATA_COL0", "10", onlyRow.get("DATA_COL0"));

    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
Example #17
Source File: TestInputFormatValueReader.java From kite with Apache License 2.0 | 5 votes |
/**
 * Builds an {@code InputFormatReader} over {@code userFile} that is
 * configured with {@code TextInputFormat}, the "value" record property and
 * a STRING schema.
 *
 * @return a new reader over {@code userFile}
 */
@Override
public DatasetReader<Text> newReader() throws IOException {
  final String inputFormatClass =
      "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
  final String recordPart = "value";

  DatasetDescriptor textDescriptor = new DatasetDescriptor.Builder()
      .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP, inputFormatClass)
      .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, recordPart)
      .schema(Schema.create(Schema.Type.STRING))
      .build();

  return new InputFormatReader<Text>(localfs, userFile, textDescriptor);
}
Example #18
Source File: DatasetTestUtilities.java From kite with Apache License 2.0 | 5 votes |
/**
 * Reads every record of the given view into a set.
 *
 * @param ds the view to read
 * @return a set containing the records produced by the view's reader
 */
public static <E> Set<E> materialize(View<E> ds) {
  Set<E> collected = Sets.newHashSet();

  DatasetReader<E> viewReader = null;
  try {
    viewReader = ds.newReader();
    for (E item : viewReader) {
      collected.add(item);
    }
  } finally {
    // Only close when newReader() actually returned a reader.
    if (viewReader != null) {
      viewReader.close();
    }
  }

  return collected;
}
Example #19
Source File: TestFileSystemDataset.java From kite with Apache License 2.0 | 5 votes |
/**
 * Reads all records of the partition identified by {@code key} and asserts
 * that each record's hashed {@code username} (and, for keys longer than one
 * element, hashed {@code email}) matches the key values.
 *
 * @param ds the dataset whose partition is read
 * @param key the partition key to look up
 * @param subpartitionName when non-null, the expected name of the
 *                         partition's single field partitioner
 * @return the number of records read from the partition
 */
@SuppressWarnings("deprecation")
private int readTestUsersInPartition(FileSystemDataset<Record> ds,
    PartitionKey key, String subpartitionName) {
  int seen = 0;
  DatasetReader<Record> partitionReader = null;
  try {
    PartitionedDataset<Record> target = ds.getPartition(key, false);

    if (subpartitionName != null) {
      // The partition is expected to have exactly one remaining partitioner.
      List<FieldPartitioner> remaining = Accessor.getDefault()
          .getFieldPartitioners(target.getDescriptor().getPartitionStrategy());
      Assert.assertEquals(1, remaining.size());
      Assert.assertEquals(subpartitionName, remaining.get(0).getName());
    }

    partitionReader = target.newReader();
    for (GenericData.Record user : partitionReader) {
      // First key element: non-negative username hash modulo 2.
      Assert.assertEquals(user.toString(), key.get(0),
          (user.get("username").hashCode() & Integer.MAX_VALUE) % 2);
      if (key.getLength() > 1) {
        // Second key element: non-negative email hash modulo 3.
        Assert.assertEquals(key.get(1),
            (user.get("email").hashCode() & Integer.MAX_VALUE) % 3);
      }
      seen++;
    }
  } finally {
    if (partitionReader != null) {
      partitionReader.close();
    }
  }
  return seen;
}
Example #20
Source File: TestFileSystemDatasetReader.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code FileSystemDatasetReader} over the bundled
 * {@code data/strings-100.avro} resource, using {@code STRING_SCHEMA}.
 *
 * @return a new reader over the test resource file
 */
@Override
public DatasetReader<Record> newReader() throws IOException {
  return new FileSystemDatasetReader<Record>(
      LocalFileSystem.getInstance(),
      // Resolve the classpath resource to a filesystem path.
      new Path(Resources.getResource("data/strings-100.avro").getFile()),
      STRING_SCHEMA,
      Record.class);
}
Example #21
Source File: TestMultiFileDatasetReader.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code MultiFileDatasetReader} that reads {@code TEST_FILE}
 * twice, using a filesystem obtained from a fresh {@code Configuration}.
 *
 * @return a new reader over the two test file entries
 */
@Override
public DatasetReader newReader() throws IOException {
  FileSystem defaultFs = FileSystem.get(new Configuration());
  return new MultiFileDatasetReader<Record>(
      defaultFs,
      // The same file is listed twice on purpose.
      Lists.newArrayList(TEST_FILE, TEST_FILE),
      DESCRIPTOR,
      CONSTRAINTS,
      ACCESSOR);
}
Example #22
Source File: TestCSVFileReader.java From kite with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code CSVFileReader} over {@code validatorFile}, described by
 * {@code VALIDATOR_SCHEMA} with the {@code kite.csv.has-header} property set
 * to {@code true}.
 *
 * @return a new CSV reader producing generic records
 */
@Override
public DatasetReader<GenericData.Record> newReader() throws IOException {
  final DatasetDescriptor csvDescriptor = new DatasetDescriptor.Builder()
      .property("kite.csv.has-header", "true")
      .schema(VALIDATOR_SCHEMA)
      .build();

  return new CSVFileReader<GenericData.Record>(
      localfs,
      validatorFile,
      csvDescriptor,
      DataModelUtil.accessor(GenericData.Record.class, csvDescriptor.getSchema()));
}
Example #23
Source File: AbstractRefinableView.java From kite with Apache License 2.0 | 5 votes |
@Override public boolean isEmpty() { DatasetReader<E> reader = null; try { // use a reader because files may be present but empty reader = newReader(); return !reader.hasNext(); } finally { if (reader != null) { reader.close(); } } }
Example #24
Source File: DatasetTestUtilities.java From kite with Apache License 2.0 | 5 votes |
public static <R> void checkReaderIteration(DatasetReader<R> reader, int expectedRecordCount, RecordValidator<R> validator) { int recordCount = 0; Assert.assertTrue("Reader is not open", reader.isOpen()); Assert.assertTrue("Reader has no records, expected " + expectedRecordCount, (expectedRecordCount == 0) || reader.hasNext()); for (R record : reader) { // add calls to hasNext, which should not affect the iteration validator.validate(record, recordCount); Assert.assertNotNull(record); reader.hasNext(); recordCount++; } Assert.assertFalse("Reader is empty, but hasNext is true", reader.hasNext()); // verify that NoSuchElementException is thrown when hasNext returns false try { reader.next(); Assert.fail("Reader did not throw NoSuchElementException"); } catch (NoSuchElementException ex) { // this is the correct behavior } Assert.assertTrue("Reader is empty, but should be open", reader.isOpen()); // verify the correct number of records were produced // if hasNext advances the reader, then this will be wrong Assert.assertEquals("Incorrect number of records", expectedRecordCount, recordCount); }
Example #25
Source File: TestInputFormatKeyReader.java From kite with Apache License 2.0 | 5 votes |
/**
 * Builds an {@code InputFormatReader} over {@code userFile} that is
 * configured with {@code TextInputFormat}, the "key" record property and a
 * LONG schema.
 *
 * @return a new reader over {@code userFile}
 */
@Override
public DatasetReader<LongWritable> newReader() throws IOException {
  final String inputFormatClass =
      "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
  final String recordPart = "key";

  DatasetDescriptor keyDescriptor = new DatasetDescriptor.Builder()
      .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP, inputFormatClass)
      .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, recordPart)
      .schema(Schema.create(Schema.Type.LONG))
      .build();

  return new InputFormatReader<LongWritable>(localfs, userFile, keyDescriptor);
}
Example #26
Source File: TestAvroWriter.java From kite with Apache License 2.0 | 5 votes |
@Test public void testCommitFlushedRecords() throws IOException { init(fsWriter); List<Record> written = Lists.newArrayList(); long i; for (i = 0; i < 10000; i += 1) { Record record = record(i, "test-" + i); fsWriter.write(record); written.add(record); } ((Flushable) fsWriter).flush(); for (i = 10000; i < 11000; i += 1) { fsWriter.write(record(i, "test-" + i)); } // put the writer into an error state, simulating either: // 1. A failed record with an IOException or unknown RuntimeException // 2. A failed flush or sync for IncrementableWriters fsWriter.state = ReaderWriterState.ERROR; fsWriter.close(); FileStatus[] stats = fs.listStatus(testDirectory, PathFilters.notHidden()); Assert.assertEquals("Should contain a visible data file", 1, stats.length); DatasetReader<Record> reader = newReader(stats[0].getPath(), TEST_SCHEMA); Assert.assertEquals("Should match written records", written, Lists.newArrayList((Iterator) init(reader))); }
Example #27
Source File: UserProfileDatasetExample.java From kite with Apache License 2.0 | 5 votes |
/** * Print all user profiles. * * This method demonstrates how to open a reader that will read the entire * table. It has no start or stop keys specified. */ public void printUserProfies() { DatasetReader<UserProfileModel2> reader = userProfileDataset.newReader(); try { for (UserProfileModel2 userProfile : reader) { System.out.println(userProfile.toString()); } } finally { // readers need to be closed. reader.close(); } }
Example #28
Source File: DaoViewTest.java From kite with Apache License 2.0 | 5 votes |
@Test public void testRange() { populateTestEntities(10); final AbstractRefinableView<TestEntity> range = new DaoView<TestEntity>(ds, TestEntity.class) .fromAfter(NAMES[0], "1").to(NAMES[0], "9") .fromAfter(NAMES[1], "1").to(NAMES[1], "9"); // Test entity range checks // Note that these are strings, not ints, so lexicographic ordering is used Assert.assertTrue(range.includes(newTestEntity("5", "5"))); Assert.assertTrue(range.includes(newTestEntity("5", "55"))); Assert.assertTrue(range.includes(newTestEntity("9", "89"))); Assert.assertTrue(range.includes(newTestEntity("9", "9"))); Assert.assertFalse(range.includes(newTestEntity("1", "1"))); Assert.assertFalse(range.includes(newTestEntity("1", "0"))); Assert.assertFalse(range.includes(newTestEntity("1", "10"))); Assert.assertFalse(range.includes(newTestEntity("9", "99"))); DatasetReader<TestEntity> reader = range.newReader(); int cnt = 2; try { for (TestEntity entity : reader) { Assert.assertEquals(Integer.toString(cnt), entity.getPart1()); Assert.assertEquals(Integer.toString(cnt), entity.getPart2()); cnt++; } } finally { reader.close(); } Assert.assertEquals(10, cnt); }
Example #29
Source File: DaoViewTest.java From kite with Apache License 2.0 | 5 votes |
/**
 * Reads the given view and asserts that it yields consecutively numbered
 * entities starting at {@code startIdx} and stopping just before
 * {@code endIdx}.
 *
 * @param range the view to read
 * @param startIdx the number expected in both parts of the first entity
 * @param endIdx the value the running index must reach once reading ends
 */
private void validRange(View<TestEntity> range, int startIdx, int endIdx) {
  int nextIndex = startIdx;

  DatasetReader<TestEntity> rangeReader = range.newReader();
  try {
    for (TestEntity current : rangeReader) {
      Assert.assertEquals(Integer.toString(nextIndex), current.getPart1());
      Assert.assertEquals(Integer.toString(nextIndex), current.getPart2());
      nextIndex++;
    }
  } finally {
    rangeReader.close();
  }

  Assert.assertEquals(endIdx, nextIndex);
}
Example #30
Source File: TestHBaseActionModifiable.java From kite with Apache License 2.0 | 5 votes |
/**
 * Asserts whether the dataset currently yields at least one record.
 *
 * @param shouldExist {@code true} to require a record, {@code false} to
 *                    require that the reader has none
 */
private void checkRecord(boolean shouldExist) {
  DatasetReader<TestEntity> probe = ds.newReader();
  try {
    boolean hasRecord = probe.hasNext();
    if (shouldExist) {
      assertTrue(hasRecord);
    } else {
      assertFalse(hasRecord);
    }
  } finally {
    probe.close();
  }
}