org.kitesdk.data.View Java Examples
The following examples show how to use org.kitesdk.data.View.
Each example notes its originating project and source file.
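Before the individual examples, here is a minimal sketch of the workflow they all share: load a dataset, narrow it to a View with a refinement method such as with(), and read through the view. The dataset URI, field name, and value here are illustrative placeholders, not taken from any project below.

// Load a dataset (a Dataset is itself a View over all of its records).
Dataset<GenericRecord> events = Datasets.load("dataset:hdfs:/data/events");

// Narrow the dataset to the records where "user_id" is 42.
View<GenericRecord> forUser = events.with("user_id", 42L);

// Read the matching records through the view.
DatasetReader<GenericRecord> reader = null;
try {
  reader = forUser.newReader();
  for (GenericRecord event : reader) {
    System.out.println(event);
  }
} finally {
  if (reader != null) {
    reader.close();
  }
}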
Example #1
Source File: CrunchDatasets.java, from kite (Apache License 2.0)
private GetStorageKey(View<E> view, int numPartitionWriters) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  // get serializable versions of transient objects
  this.strategyString = descriptor.getPartitionStrategy()
      .toString(false /* no white space */);
  this.schemaString = descriptor.getSchema()
      .toString(false /* no white space */);
  this.type = view.getType();
  if (view instanceof AbstractRefinableView) {
    this.constraints = ((AbstractRefinableView) view).getConstraints()
        .toQueryMap();
  } else {
    this.constraints = null;
  }
  this.numPartitionWriters = numPartitionWriters > 0 ? numPartitionWriters : 1;
}
Example #2
Source File: CreateEvents.java, from kite-examples (Apache License 2.0)
@Override
public int run(List<String> args) throws Exception {
  Preconditions.checkState(!Datasets.exists(uri),
      "events dataset already exists");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(StandardEvent.class).build();
  View<StandardEvent> events = Datasets.create(uri, descriptor,
      StandardEvent.class);

  DatasetWriter<StandardEvent> writer = events.newWriter();
  try {
    while (System.currentTimeMillis() - baseTimestamp < 36000) {
      writer.write(generateRandomEvent());
    }
  } finally {
    writer.close();
  }

  System.out.println("Generated " + counter + " events");
  return 0;
}
Example #3
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForMissingPartitionNames() {
  Path path = new Path("2014/3/14");

  // like PathConversion, this uses names from the partition strategy
  // and will accept partitions that don't have a "name=" component
  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #4
Source File: DatasetTestUtilities.java, from kite (Apache License 2.0)
public static void writeTestUsers(View<GenericData.Record> view, int count,
    int start, String... fields) {
  DatasetWriter<GenericData.Record> writer = null;
  try {
    writer = view.newWriter();
    for (int i = start; i < count + start; i++) {
      GenericRecordBuilder recordBuilder = new GenericRecordBuilder(
          view.getDataset().getDescriptor().getSchema())
          .set("username", "test-" + i);
      for (String field : fields) {
        recordBuilder.set(field, field + "-" + i);
      }
      writer.write(recordBuilder.build());
    }
    if (writer instanceof Flushable) {
      ((Flushable) writer).flush();
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
}
Example #5
Source File: TestCrunchDatasetsHBase.java, from kite (Apache License 2.0)
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class,
      HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #6
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #7
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForIncompleteUri() {
  Path path = new Path("/tmp/datasets/ns/test/y=2014/m=03");

  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #8
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
Example #9
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForDifferentPartitionNames() {
  Path path = new Path("year=2014/month=3/day=14");

  // like PathConversion, this uses names from the partition strategy
  // and will accept partitions that have a different "name=" component
  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #10
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testViewForUri() {
  Path path = new Path("/tmp/datasets/ns/test/y=2014/m=03/d=14");

  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, "file:" + path);
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  view = FileSystemDatasets.viewForUri(
      dataset, path.toString());
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #11
Source File: DatasetKeyInputFormat.java, from kite (Apache License 2.0)
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder readFrom(View<?> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  // if this is a partitioned dataset, add the partition location
  if (view instanceof FileSystemDataset) {
    conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
  }
  // add descriptor properties to the config
  for (String property : descriptor.listProperties()) {
    conf.set(property, descriptor.getProperty(property));
  }

  if (DataModelUtil.isGeneric(view.getType())) {
    Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
    // only set the read schema if the view is a projection
    if (!datasetSchema.equals(view.getSchema())) {
      withSchema(view.getSchema());
    }
  } else {
    withType(view.getType());
  }

  conf.set(KITE_INPUT_URI, view.getUri().toString());
  return this;
}
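For context, here is a sketch of how this builder is typically reached from job setup code; Example #12 below shows the same call in a complete test. The dataset URI and the "month" constraint are illustrative assumptions, not part of the source above.

Job job = new Job();
// Load a dataset and narrow it; readFrom accepts either a dataset or a view.
Dataset<GenericData.Record> events = Datasets.load("dataset:hdfs:/data/events",
    GenericData.Record.class);
View<GenericData.Record> march = events.with("month", 3);
DatasetKeyInputFormat.configure(job).readFrom(march)
    .withType(GenericData.Record.class);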
Example #12
Source File: TestMapReduce.java, from kite (Apache License 2.0)
@Test
@SuppressWarnings("deprecation")
public void testSignalReadyOutputView() throws Exception {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  populateInputDataset();
  populateOutputDataset(); // existing output will be overwritten

  Job job = new Job();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);

  View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot");
  DatasetKeyOutputFormat.configure(job).appendTo(outputView)
      .withType(GenericData.Record.class);

  Assert.assertTrue(job.waitForCompletion(true));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #13
Source File: TestFileSystemDatasets.java, from kite (Apache License 2.0)
@Test
public void testIgnoresAuthority() {
  Path path = new Path("/tmp/datasets/ns/test/y=2014/m=03/d=14");

  View<GenericRecord> view = FileSystemDatasets.viewForUri(
      dataset, "file://127.0.0.1/tmp/datasets/ns/test/y=2014/m=03/d=14");
  Assert.assertEquals("Should create correct view",
      view, dataset.getPartitionView(path));

  Constraints expected = ((AbstractRefinableView<GenericRecord>)
      dataset.with("y", 2014).with("m", 3).with("d", 14)).getConstraints();
  Constraints actual = ((AbstractRefinableView<GenericRecord>) view)
      .getConstraints();
  Assert.assertEquals("Constraints should match expected", expected, actual);
}
Example #14
Source File: TestViewUris.java, from kite (Apache License 2.0)
@Test
public void testURIStringEquality() {
  for (int i = 0; i < 10; i++) {
    String a = UUID.randomUUID().toString();
    String b = UUID.randomUUID().toString();
    String originalUri = "view:file:/tmp/test_name?color=" + a + "," + b;

    View<GenericRecord> view = Datasets.load(originalUri);
    String afterUri = view.getUri().toString();

    if (!originalUri.equals(afterUri)) {
      System.out.println("Iteration: " + i);
      System.out.println("Start: " + originalUri);
      System.out.println("End  : " + afterUri);
    }

    Assert.assertEquals(originalUri, afterUri);
  }
}
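The round-trip above relies on the view URI syntax, which layers constraints onto a dataset URI as query parameters. A minimal sketch of the pattern, using the same illustrative path as the test:

// view:<storage>:<path>?<field>=<comma-separated values>
View<GenericRecord> colored = Datasets.load(
    "view:file:/tmp/test_name?color=red,green");

Because the constraints are encoded in the string, view.getUri() can reproduce the original URI, which is exactly what the test asserts.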
Example #15
Source File: FileSystemDataset.java, from kite (Apache License 2.0)
@Override
public boolean canReplace(View<E> part) {
  if (part instanceof FileSystemView) {
    return equals(part.getDataset()) &&
        ((FileSystemView) part).getConstraints().alignedWithBoundaries();
  } else if (part instanceof FileSystemDataset) {
    return equals(part);
  }
  return false;
}
Example #16
Source File: FileSystemDataset.java, from kite (Apache License 2.0)
View<E> viewForUri(URI location) {
  Preconditions.checkNotNull(location, "Partition location cannot be null");
  PartitionView<E> view = getPartitionView(location);
  if (view == unbounded) {
    return this;
  }
  return view;
}
Example #17
Source File: TestProjection.java, from kite (Apache License 2.0)
@Test
public void testSpecificProjectionAsType() throws IOException {
  Dataset<GenericRecord> original = Datasets.load(unbounded.getUri());

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.asType(StandardEvent.class).newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  final View<SmallEvent> smallEvents = original.asType(SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(toSmallEvent(sepEvent),
      toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}
Example #18
Source File: DaoViewTest.java, from kite (Apache License 2.0)
private void validRange(View<TestEntity> range, int startIdx, int endIdx) {
  int cnt = startIdx;
  DatasetReader<TestEntity> reader = range.newReader();
  try {
    for (TestEntity entity : reader) {
      Assert.assertEquals(Integer.toString(cnt), entity.getPart1());
      Assert.assertEquals(Integer.toString(cnt), entity.getPart2());
      cnt++;
    }
  } finally {
    reader.close();
  }
  Assert.assertEquals(endIdx, cnt);
}
Example #19
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@SuppressWarnings("deprecation") private static <E> View<E> load(JobContext jobContext) { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Class<E> type = getType(jobContext); String outputUri = conf.get(KITE_OUTPUT_URI); return Datasets.<E, View<E>>load(outputUri, type); }
Example #20
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  return usePerTaskAttemptDatasets(view, conf) ?
      new MergeOutputCommitter<E>() : new NullOutputCommitter();
}
Example #21
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
public void checkOutputSpecs(JobContext jobContext) {
  // The committer setup will fail if the output dataset does not exist
  View<E> target = load(jobContext);
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
    case APPEND:
      break;
    case OVERWRITE:
      // if the merge won't use replace, then delete the existing data
      if (!canReplace(target)) {
        target.deleteAll();
      }
      break;
    default:
    case DEFAULT:
      boolean isReady = false;
      if (target instanceof Signalable) {
        isReady = ((Signalable) target).isReady();
      }
      if (isReady || !target.isEmpty()) {
        throw new DatasetException(
            "View is not empty or has been signaled as ready: " + target);
      }
      break;
  }
}
Example #22
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  View<E> target = load(taskAttemptContext);
  View<E> working;

  if (usePerTaskAttemptDatasets(target, conf)) {
    working = loadOrCreateTaskAttemptView(taskAttemptContext);
  } else {
    working = target;
  }

  boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false);

  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (working.getDataset().getDescriptor().isPartitioned() &&
      partitionDir != null) {
    if (!(target instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + target);
    }
    FileSystemDataset fsDataset = (FileSystemDataset) target;
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    if (key != null && !key.getValues().isEmpty()) {
      working = fsDataset.getPartition(key, true);
    }
    return new DatasetRecordWriter<E>(working, copyRecords);
  } else {
    return new DatasetRecordWriter<E>(working, copyRecords);
  }
}
Example #23
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@Override
@SuppressWarnings("unchecked")
public void commitJob(JobContext jobContext) throws IOException {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo = getDatasetRepository(jobContext);
  boolean isTemp = repo instanceof TemporaryDatasetRepository;

  String jobDatasetName = getJobDatasetName(jobContext);
  View<E> targetView = load(jobContext);
  Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, jobDatasetName);

  WriteMode mode = conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT);
  if (mode == WriteMode.OVERWRITE && canReplace(targetView)) {
    ((Replaceable<View<E>>) targetView.getDataset()).replace(targetView, jobDataset);
  } else {
    ((Mergeable<Dataset<E>>) targetView.getDataset()).merge(jobDataset);
  }

  if (targetView instanceof Signalable) {
    ((Signalable) targetView).signalReady();
  }

  if (isTemp) {
    ((TemporaryDatasetRepository) repo).delete();
  } else {
    repo.delete(TEMP_NAMESPACE, jobDatasetName);
  }
}
Example #24
Source File: DatasetSink.java, from kite (Apache License 2.0)
private DatasetWriter<GenericRecord> newWriter(
    final UserGroupInformation login, final URI uri) {
  View<GenericRecord> view = KerberosUtil.runPrivileged(login,
      new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
        @Override
        public Dataset<GenericRecord> run() {
          return Datasets.load(uri);
        }
      });

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  String formatName = descriptor.getFormat().getName();
  Preconditions.checkArgument(allowedFormats().contains(formatName),
      "Unsupported format: " + formatName);

  Schema newSchema = descriptor.getSchema();
  if (targetSchema == null || !newSchema.equals(targetSchema)) {
    this.targetSchema = descriptor.getSchema();
    // target dataset schema has changed, invalidate all readers based on it
    readers.invalidateAll();
  }

  this.reuseDatum = !("parquet".equals(formatName));
  this.datasetName = view.getDataset().getName();

  return view.newWriter();
}
Example #25
Source File: TestProjection.java, from kite (Apache License 2.0)
@Test
public void testReflectProjectionLoad() throws IOException {
  Dataset<ReflectStandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(ReflectStandardEvent.class)
          .build(),
      ReflectStandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  View<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
Example #26
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder writeTo(View<?> view) {
  if (view instanceof FileSystemDataset) {
    FileSystemDataset dataset = (FileSystemDataset) view;
    conf.set(KITE_PARTITION_DIR,
        String.valueOf(dataset.getDescriptor().getLocation()));
  }
  withType(view.getType());
  return writeTo(view.getUri());
}
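This is the output-side counterpart of the readFrom builder in Example #11, and is normally reached through DatasetKeyOutputFormat.configure(job); Example #12 uses the related appendTo method the same way. A sketch under those assumptions (the job setup, URI, and constraint are illustrative):

Job job = new Job();
Dataset<GenericData.Record> out = Datasets.load("dataset:hdfs:/data/out",
    GenericData.Record.class);
// Writing through a view scopes the job's output to that view's constraints.
View<GenericData.Record> target = out.with("month", 3);
DatasetKeyOutputFormat.configure(job).appendTo(target)
    .withType(GenericData.Record.class);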
Example #27
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
private void runCheckpointPipeline(View<Record> inputView,
    View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView),
      Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
Example #28
Source File: CompactionTask.java, from kite (Apache License 2.0)
@SuppressWarnings("unchecked") private void checkCompactable(View<T> view) { Dataset<T> dataset = view.getDataset(); if (!(dataset instanceof Replaceable)) { throw new IllegalArgumentException("Cannot compact dataset: " + dataset); } Replaceable<View<T>> replaceable = ((Replaceable<View<T>>) dataset); Preconditions.checkArgument(replaceable.canReplace(view), "Cannot compact view: " + view); }
Example #29
Source File: TestCrunchDatasets.java, from kite (Apache License 2.0)
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder().schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
Example #30
Source File: DatasetKeyOutputFormat.java, from kite (Apache License 2.0)
@SuppressWarnings("unchecked") private static boolean canReplace(View<?> view) { if (Hadoop.isHadoop1()) { // can't use replace because it is called in the OutputCommitter. return false; } Dataset<?> dataset = view.getDataset(); return (dataset instanceof Replaceable && ((Replaceable<View<?>>) dataset).canReplace(view)); }