Java Code Examples for org.kitesdk.data.DatasetReader#close()

The following examples show how to use org.kitesdk.data.DatasetReader#close(). They are taken from open-source projects; the source file and project are listed above each example.
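DatasetReader implements java.io.Closeable, and close() releases the reader's underlying resources once iteration is done. As a minimal sketch of the pattern the examples below follow (the dataset URI here is a hypothetical placeholder, not taken from any project), a reader can be closed in a finally block, or automatically with try-with-resources on Java 7 and later:

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;

// A minimal sketch; "dataset:hdfs:/tmp/data/example" is an illustrative URI.
Dataset<GenericRecord> dataset = Datasets.load("dataset:hdfs:/tmp/data/example");

// DatasetReader extends Closeable, so try-with-resources closes it automatically.
try (DatasetReader<GenericRecord> reader = dataset.newReader()) {
  for (GenericRecord record : reader) {
    System.out.println(record);
  }
}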
Example 1
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testOverrideTypeMapping() throws IOException {
  String [] types = { "INT" };
  String [] vals = { "10" };
  createTableWithColTypes(types, vals);

  String [] extraArgs = { "--map-column-java", "DATA_COL0=String"};
  runImport(getOutputArgv(true, extraArgs));

  Schema schema = getSchema();
  assertEquals(Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());
  checkField(fields.get(0), "DATA_COL0", Type.STRING);

  DatasetReader<GenericRecord> reader = getReader();
  try {
    assertTrue(reader.hasNext());
    GenericRecord record1 = reader.next();
    assertEquals("DATA_COL0", "10", record1.get("DATA_COL0"));
    assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 2
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testFirstUnderscoreInColumnName() throws IOException {
  String [] names = { "_NAME" };
  String [] types = { "INT" };
  String [] vals = { "1987" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Schema schema = getSchema();
  assertEquals(Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());
  checkField(fields.get(0), "__NAME", Type.INT);

  DatasetReader<GenericRecord> reader = getReader();
  try {
    assertTrue(reader.hasNext());
    GenericRecord record1 = reader.next();
    assertEquals("__NAME", 1987, record1.get("__NAME"));
    assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 3
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testNonIdentCharactersInColumnName() throws IOException {
  String [] names = { "test_p-a+r/quet" };
  String [] types = { "INT" };
  String [] vals = { "2015" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Schema schema = getSchema();
  assertEquals(Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());
  checkField(fields.get(0), "TEST_P_A_R_QUET", Type.INT);

  DatasetReader<GenericRecord> reader = getReader();
  try {
    assertTrue(reader.hasNext());
    GenericRecord record1 = reader.next();
    assertEquals("TEST_P_A_R_QUET", 2015, record1.get("TEST_P_A_R_QUET"));
    assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 4
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testNullableParquetImport() throws IOException, SQLException {
  String [] types = { "INT" };
  String [] vals = { null };
  createTableWithColTypes(types, vals);

  runImport(getOutputArgv(true, null));

  DatasetReader<GenericRecord> reader = getReader();
  try {
    assertTrue(reader.hasNext());
    GenericRecord record1 = reader.next();
    assertNull(record1.get("DATA_COL0"));
    assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 5
Source File: ReadDataset.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
 
Example 6
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testIncrementalParquetImport() throws IOException, SQLException {
  String [] types = { "INT" };
  String [] vals = { "1" };
  createTableWithColTypes(types, vals);

  runImport(getOutputArgv(true, null));
  runImport(getOutputArgv(true, new String[]{"--append"}));

  DatasetReader<GenericRecord> reader = getReader();
  try {
    assertTrue(reader.hasNext());
    GenericRecord record1 = reader.next();
    assertEquals(1, record1.get("DATA_COL0"));
    record1 = reader.next();
    assertEquals(1, record1.get("DATA_COL0"));
    assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 7
Source File: ReadMovies.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  Dataset<Record> movies = Datasets.load(
      "dataset:hdfs:/tmp/data/movies", Record.class);

  DatasetReader<Record> reader = null;
  try {
    reader = movies.newReader();
    for (Record rec : reader) {
      System.err.println("Movie: " + rec);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
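This example opens the reader inside the try block, so the finally clause must null-check before closing. On Java 7 and later the same guard can be written with try-with-resources, since DatasetReader extends Closeable; a sketch of the equivalent loop:

// Equivalent sketch using try-with-resources: the reader is closed
// automatically, and close() is never reached if newReader() itself throws.
try (DatasetReader<Record> reader = movies.newReader()) {
  for (Record rec : reader) {
    System.err.println("Movie: " + rec);
  }
}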
 
Example 8
Source File: UserProfileDatasetExample.java    From kite with Apache License 2.0
/**
 * Print the user profiles and actions for all users with the provided last
 * name.
 * 
 * This method is intended to demonstrate opening a scanner with a start key
 * (see the TODO below); as written, it opens a full-table reader and filters
 * by last name on the client. It uses the composite DAO, so each record it
 * returns is a composite of both the profile model and the actions model.
 * 
 * @param lastName
 *          The last name of the users to scan.
 */
public void printUserProfileActionsForLastName(String lastName) {
  // TODO: use a reader with a start key
  DatasetReader<UserProfileActionsModel2> reader = userProfileActionsDataset.newReader();
  try {
    for (UserProfileActionsModel2 entity : reader) {
      UserProfileModel2 userProfile = entity.getUserProfileModel();
      if (userProfile.getLastName().equals(lastName)) {
        System.out.println(entity.toString());
      }
    }
  } finally {
    // readers need to be closed.
    reader.close();
  }
}
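A hedged sketch of the start-key variant the TODO calls for, assuming the dataset supports Kite's view refinement and that lastName is a refinable key field (neither assumption is shown in the source):

// Illustrative sketch only: assumes userProfileActionsDataset exposes
// RefinableView#with and that "lastName" is a refinable key field.
DatasetReader<UserProfileActionsModel2> reader =
    userProfileActionsDataset.with("lastName", lastName).newReader();
try {
  for (UserProfileActionsModel2 entity : reader) {
    System.out.println(entity.toString());
  }
} finally {
  reader.close();
}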
 
Example 9
Source File: ReadProductDatasetPojo.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the products dataset
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  // Get a reader for the dataset and read all the users
  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    for (Product product : reader) {
      System.out.println(product);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example 10
Source File: TestMapReduce.java    From kite with Apache License 2.0
private void checkOutput(boolean existingPresent) {
  DatasetReader<GenericData.Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  for (GenericData.Record record : reader) {
    counts.put(record.get("name").toString(), (Integer) record.get("count"));
  }
  reader.close();

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());
  if (existingPresent) {
    Assert.assertEquals(4, counts.get("date").intValue());
  } else {
    Assert.assertNull(counts.get("date"));
  }
}
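Unlike the other examples, this one calls close() outside any finally block, so the reader is not closed if iteration throws. A safer sketch of the same read loop:

// Safer sketch: the finally block guarantees close() even if iteration throws.
DatasetReader<GenericData.Record> reader = outputDataset.newReader();
Map<String, Integer> counts = new HashMap<String, Integer>();
try {
  for (GenericData.Record record : reader) {
    counts.put(record.get("name").toString(), (Integer) record.get("count"));
  }
} finally {
  reader.close();
}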
 
Example 11
Source File: TestFileSystemDataset.java    From kite with Apache License 2.0
@SuppressWarnings("deprecation")
private int readTestUsersInPartition(FileSystemDataset<Record> ds, PartitionKey key,
    String subpartitionName) {
  int readCount = 0;
  DatasetReader<Record> reader = null;
  try {
    PartitionedDataset<Record> partition = ds.getPartition(key, false);
    if (subpartitionName != null) {
      List<FieldPartitioner> fieldPartitioners =
          Accessor.getDefault().getFieldPartitioners(partition.getDescriptor()
              .getPartitionStrategy());
      Assert.assertEquals(1, fieldPartitioners.size());
      Assert.assertEquals(subpartitionName, fieldPartitioners.get(0)
          .getName());
    }
    reader = partition.newReader();
    for (GenericData.Record actualRecord : reader) {
      Assert.assertEquals(actualRecord.toString(), key.get(0), (actualRecord
          .get("username").hashCode() & Integer.MAX_VALUE) % 2);
      if (key.getLength() > 1) {
        Assert.assertEquals(key.get(1),
            (actualRecord.get("email").hashCode() & Integer.MAX_VALUE) % 3);
      }
      readCount++;
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
  return readCount;
}
 
Example 12
Source File: DaoViewTest.java    From kite with Apache License 2.0
private void validRange(View<TestEntity> range, int startIdx, int endIdx) {
  int cnt = startIdx;
  DatasetReader<TestEntity> reader = range.newReader();
  try {
    for (TestEntity entity : reader) {
      Assert.assertEquals(Integer.toString(cnt), entity.getPart1());
      Assert.assertEquals(Integer.toString(cnt), entity.getPart2());
      cnt++;
    }
  } finally {
    reader.close();
  }
  Assert.assertEquals(endIdx, cnt);
}
 
Example 13
Source File: UserProfileDatasetExample.java    From kite with Apache License 2.0
/**
 * Print all user profiles.
 * 
 * This method demonstrates how to open a reader that will read the entire
 * table. It has no start or stop keys specified.
 */
public void printUserProfiles() {
  DatasetReader<UserProfileModel2> reader = userProfileDataset.newReader();
  try {
    for (UserProfileModel2 userProfile : reader) {
      System.out.println(userProfile.toString());
    }
  } finally {
    // readers need to be closed.
    reader.close();
  }
}
 
Example 14
Source File: ReadUserDataset.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  // Dataset is named [table].[entity]
  RandomAccessDataset<User> users = Datasets.load(
      "dataset:hbase:quickstart.cloudera/users.User", User.class);

  // Get an accessor for the dataset and look up a user by username
  Key key = new Key.Builder(users).add("username", "bill").build();
  System.out.println(users.get(key));
  System.out.println("----");

  // Get a reader for the dataset and read the users from "bill" onwards
  DatasetReader<User> reader = null;
  try {
    reader = users.with("username", "bill").newReader();
    for (User user : reader) {
      System.out.println(user);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example 15
Source File: TestAllTables.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testMultiTableImportAsParquetFormat() throws IOException {
  String [] argv = getArgv(new String[]{"--as-parquetfile"}, null);
  runImport(new ImportAllTablesTool(), argv);

  Path warehousePath = new Path(this.getWarehouseDir());
  int i = 0;
  for (String tableName : this.tableNames) {
    Path tablePath = new Path(warehousePath, tableName);
    Dataset dataset = Datasets.load("dataset:file:" + tablePath);

    // dequeue the expected value for this table. This
    // list has the same order as the tableNames list.
    String expectedVal = Integer.toString(i++) + ","
        + this.expectedStrings.get(0);
    this.expectedStrings.remove(0);

    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      GenericRecord record = reader.next();
      String line = record.get(0) + "," + record.get(1);
      assertEquals("Table " + tableName + " expected a different string",
          expectedVal, line);
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }
}
 
Example 16
Source File: DaoViewTest.java    From kite with Apache License 2.0
@Test
public void testRange() {
  populateTestEntities(10);

  final AbstractRefinableView<TestEntity> range = new DaoView<TestEntity>(ds, TestEntity.class)
          .fromAfter(NAMES[0], "1").to(NAMES[0], "9")
          .fromAfter(NAMES[1], "1").to(NAMES[1], "9");

  // Test entity range checks
  // Note that these are strings, not ints, so lexicographic ordering is used
  Assert.assertTrue(range.includes(newTestEntity("5", "5")));
  Assert.assertTrue(range.includes(newTestEntity("5", "55")));
  Assert.assertTrue(range.includes(newTestEntity("9", "89")));
  Assert.assertTrue(range.includes(newTestEntity("9", "9")));
  Assert.assertFalse(range.includes(newTestEntity("1", "1")));
  Assert.assertFalse(range.includes(newTestEntity("1", "0")));
  Assert.assertFalse(range.includes(newTestEntity("1", "10")));
  Assert.assertFalse(range.includes(newTestEntity("9", "99")));

  DatasetReader<TestEntity> reader = range.newReader();
  int cnt = 2;
  try {
    for (TestEntity entity : reader) {
      Assert.assertEquals(Integer.toString(cnt), entity.getPart1());
      Assert.assertEquals(Integer.toString(cnt), entity.getPart2());
      cnt++;
    }
  } finally {
    reader.close();
  }

  Assert.assertEquals(10, cnt);
}
 
Example 17
Source File: TestHiveImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private void verifyHiveDataset(String tableName, Object[][] valsArray) {
  String datasetUri = String.format("dataset:hive:default/%s",
      tableName.toLowerCase());
  assertTrue(Datasets.exists(datasetUri));
  Dataset dataset = Datasets.load(datasetUri);
  assertFalse(dataset.isEmpty());

  DatasetReader<GenericRecord> reader = dataset.newReader();
  try {
    List<String> expectations = new ArrayList<String>();
    if (valsArray != null) {
      for (Object[] vals : valsArray) {
        expectations.add(Arrays.toString(vals));
      }
    }

    while (reader.hasNext() && expectations.size() > 0) {
      String actual = Arrays.toString(
          convertGenericRecordToArray(reader.next()));
      assertTrue("Expect record: " + actual, expectations.remove(actual));
    }
    assertFalse(reader.hasNext());
    assertEquals(0, expectations.size());
  } finally {
    reader.close();
  }
}
 
Example 18
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testUseReaderSchemaParquet() throws IOException {

  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());

  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();

  try {

    writer.write(oldUser);

  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
      NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
      Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {

    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());

  } finally {
    reader.close();
  }
}
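Note that this test (and the next) opens the reader before pipeline.run() executes; whether the reader sees the pipeline's output depends on when it lists the underlying files. A more defensive sketch would open it only after the run succeeds:

// Defensive sketch: open the reader only after the pipeline has finished,
// and let try-with-resources close it.
Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

try (DatasetReader<Record> reader = out.newReader()) {
  Assert.assertEquals(oldUser, reader.next());
  Assert.assertFalse(reader.hasNext());
}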
 
Example 19
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testUseReaderSchema() throws IOException {

  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();

  try {

    writer.write(oldUser);

  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
      NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
      Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  DatasetReader reader = out.newReader();

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  try {

    // there should be one record that is equal to our old user generic record.
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());

  } finally {
    reader.close();
  }
}