org.apache.crunch.Target Java Examples

The following examples show how to use org.apache.crunch.Target. They are taken from open source projects; the line above each example gives the source file, the project it comes from, and its license.
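
Most of the examples below share the same shape: obtain a Crunch source from a Kite dataset or view with CrunchDatasets.asSource, obtain a Target with CrunchDatasets.asTarget, and pass a Target.WriteMode when writing. The sketch below condenses that pattern; the dataset URIs and the CopyExample class are illustrative assumptions, not taken from any example on this page.

// Minimal sketch of the read/write pattern used throughout the examples below.
// The dataset URIs and the CopyExample class are assumptions for illustration.
public void copyAppend() {
  Pipeline pipeline = new MRPipeline(CopyExample.class);

  // read a Kite dataset (or view) as a Crunch source
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(URI.create("dataset:hdfs:/tmp/data/in"),
          GenericData.Record.class));

  // write to a second dataset, appending to any data already there
  pipeline.write(records,
      CrunchDatasets.asTarget("dataset:hdfs:/tmp/data/out"),
      Target.WriteMode.APPEND);

  pipeline.done();
}
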
Example #1
Source File: TestCrunchDatasetsHBase.java    From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
 
Example #2
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testMultipleFileReadingFromCrunch() throws IOException {
  Dataset<Record> inputDatasetA = repo.create("ns", "inA", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> inputDatasetB = repo.create("ns", "inB", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDatasetA, 5, 0);
  writeTestUsers(inputDatasetB, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> dataA = pipeline.read(
      CrunchDatasets.asSource(inputDatasetA));
  PCollection<GenericData.Record> dataB = pipeline.read(
      CrunchDatasets.asSource(inputDatasetB));
  pipeline.write(dataA.union(dataB), CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Example #3
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);

  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
 
Example #4
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
 
Example #5
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .provided("version").build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);

  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #6
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #7
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #8
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
 
Example #9
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
 
Example #10
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Example #11
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Example #12
Source File: TestCrunchDatasetsHBase.java    From kite with Apache License 2.0
@Test
public void testGeneric() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 10, 0);
}
 
Example #13
Source File: MemPipelineUnitTest.java    From tutorials with MIT License
@Test
@Ignore("Requires Hadoop binaries")
public void givenCollection_whenWriteCalled_fileWrittenSuccessfully()
    throws IOException {
    PCollection<String> inputStrings = MemPipeline.collectionOf("Hello",
        "Apache", "Crunch", Calendar.getInstance()
            .toString());
    final String outputFilePath = createOutputPath();
    Target target = To.textFile(outputFilePath);

    inputStrings.write(target);

    Pipeline pipeline = MemPipeline.getInstance();
    PCollection<String> lines = pipeline.readTextFile(outputFilePath);
    assertIterableEquals(inputStrings.materialize(), lines.materialize());
}
 
Example #14
Source File: TransformTask.java    From kite with Apache License 2.0
/**
 * Set the output write mode: default, overwrite, or append.
 *
 * @param mode the output write mode
 * @return this for method chaining
 */
public TransformTask setWriteMode(Target.WriteMode mode) {
  Preconditions.checkArgument(mode != Target.WriteMode.CHECKPOINT,
      "Checkpoint is not an allowed write mode");
  this.mode = mode;
  return this;
}
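
The precondition above rules out Target.WriteMode.CHECKPOINT; the remaining modes (DEFAULT, APPEND, OVERWRITE) can be passed through. A hedged usage sketch, modeled on the TransformCommand example further down; the source and dest views and transformFn are assumptions.

// Hypothetical caller of setWriteMode, mirroring the TransformCommand example
// below; the source/dest views and transformFn are assumptions.
TransformTask<Record, Record> task =
    new TransformTask<Record, Record>(source, dest, transformFn);
task.setConf(getConf());
task.setWriteMode(Target.WriteMode.OVERWRITE);  // APPEND and DEFAULT are also accepted
// task.setWriteMode(Target.WriteMode.CHECKPOINT) would fail the precondition above
PipelineResult result = task.run();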
 
Example #15
Source File: StagingToPersistent.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  final long startOfToday = startOfDay();

  // the destination dataset
  Dataset<Record> persistent = Datasets.load(
      "dataset:file:/tmp/data/logs", Record.class);

  // the source: anything before today in the staging area
  Dataset<Record> staging = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);
  View<Record> ready = staging.toBefore("timestamp", startOfToday);

  ReadableSource<Record> source = CrunchDatasets.asSource(ready);

  PCollection<Record> stagedLogs = read(source);

  getPipeline().write(stagedLogs,
      CrunchDatasets.asTarget(persistent), Target.WriteMode.APPEND);

  PipelineResult result = run();

  if (result.succeeded()) {
    // remove the source data partition from staging
    ready.deleteAll();
    return 0;
  } else {
    return 1;
  }
}
 
Example #16
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
private void runCheckpointPipeline(View<Record> inputView,
    View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView),
      Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
 
Example #17
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable)outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable)outputView).isReady());
}
 
Example #18
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testViewUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  URI sourceViewUri = new URIBuilder(repo.getUri(), "ns", "in").with("username",
      "test-0").build();
  View<Record> inputView = Datasets.<Record, Dataset<Record>> load(sourceViewUri,
      Record.class);
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets
      .asSource(sourceViewUri, GenericData.Record.class));
  URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out").with(
      "email", "email-0").build();
  pipeline.write(data, CrunchDatasets.asTarget(targetViewUri),
      Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #19
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testPartitionedSourceAndTargetWritingToTopLevel() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));

  // check all records are in the correct partition
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, false);
  Assert.assertNotNull(outputPart0);
  Assert.assertEquals(5, datasetSize(outputPart0));
}
 
Example #20
Source File: CopyCommand.java    From kite with Apache License 2.0
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  View<GenericRecord> dest = load(datasets.get(1));
  View<GenericRecord> source = load(datasets.get(0))
      .asSchema(dest.getSchema());

  CopyTask task = new CopyTask<GenericRecord>(source, dest);

  task.setConf(getConf());

  if (noCompaction) {
    task.noCompaction();
  }

  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }

  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }

  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Added {} records to \"{}\"",
        task.getCount(), datasets.get(1));
    return 0;
  } else {
    return 1;
  }
}
 
Example #21
Source File: JSONImportCommand.java    From kite with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 2,
      "JSON path and target dataset name are required.");

  Path source = qualifiedPath(targets.get(0));
  FileSystem sourceFS = source.getFileSystem(getConf());
  Preconditions.checkArgument(sourceFS.exists(source),
      "JSON path does not exist: " + source);

  String dataset = targets.get(1);

  View<Record> target = load(dataset, Record.class);
  Schema datasetSchema = target.getDataset().getDescriptor().getSchema();

  DatasetDescriptor jsonDescriptor = new DatasetDescriptor.Builder()
      .location(source.toUri())
      .schema(ColumnMappingParser.removeEmbeddedMapping(
          PartitionStrategyParser.removeEmbeddedStrategy(datasetSchema)))
      .format("json")
      .build();

  TemporaryFileSystemDatasetRepository repo =
      new TemporaryFileSystemDatasetRepository(getConf(),
          // ensure the same FS as the file source is used
          sourceFS.makeQualified(new Path("/tmp/" + UUID.randomUUID().toString())),
          target.getDataset().getNamespace(),
          UUID.randomUUID().toString());

  try {
    FileSystemDataset<Record> jsonDataset =
        (FileSystemDataset) repo.create("import", "json", jsonDescriptor);

    Iterator<Path> iter = jsonDataset.pathIterator().iterator();
    Preconditions.checkArgument(iter.hasNext(),
        "JSON path has no data files: " + source);

    TaskUtil.configure(getConf()).addJars(jars);

    TransformTask task;
    if (transform != null) {
      DoFn<Record, Record> transformFn;
      try {
        DynConstructors.Ctor<DoFn<Record, Record>> ctor =
            new DynConstructors.Builder(DoFn.class)
                .loader(loaderForJars(jars))
                .impl(transform)
                .buildChecked();
        transformFn = ctor.newInstance();
      } catch (NoSuchMethodException e) {
        throw new DatasetException(
            "Cannot find no-arg constructor for class: " + transform, e);
      }
      task = new TransformTask<Record, Record>(
          jsonDataset, target, transformFn);
    } else {
      task = new CopyTask<Record>(jsonDataset, target);
    }

    task.setConf(getConf());

    if (noCompaction) {
      task.noCompaction();
    }

    if (numWriters >= 0) {
      task.setNumWriters(numWriters);
    }

    if (filesPerPartition > 0) {
      task.setFilesPerPartition(filesPerPartition);
    }

    if (overwrite) {
      task.setWriteMode(Target.WriteMode.OVERWRITE);
    }

    PipelineResult result = task.run();

    if (result.succeeded()) {
      long count = task.getCount();
      if (count > 0) {
        console.info("Added {} records to \"{}\"", count, dataset);
      }
      return 0;
    } else {
      return 1;
    }
  } finally {
    // clean up the temporary repository
    repo.delete();
  }
}
 
Example #22
Source File: CompactionTask.java    From kite with Apache License 2.0
public CompactionTask(View<T> view) {
  checkCompactable(view);
  this.task = new CopyTask<T>(view, view);
  task.setWriteMode(Target.WriteMode.OVERWRITE);
}
 
Example #23
Source File: ThriftTarget.java    From hdfs2cass with Apache License 2.0
@Override
public Target outputConf(final String key, final String value) {
  extraConf.put(key, value);
  return this;
}
 
Example #24
Source File: TransformCommand.java    From kite with Apache License 2.0
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  View<Record> source = load(datasets.get(0), Record.class);
  View<Record> dest = load(datasets.get(1), Record.class);

  TaskUtil.configure(getConf()).addJars(jars);

  TransformTask task;
  if (transform != null) {
    DoFn<Record, Record> transformFn;
    try {
      DynConstructors.Ctor<DoFn<Record, Record>> ctor =
          new DynConstructors.Builder(DoFn.class)
              .loader(loaderForJars(jars))
              .impl(transform)
              .buildChecked();
      transformFn = ctor.newInstance();
    } catch (NoSuchMethodException e) {
      throw new DatasetException(
          "Cannot find no-arg constructor for class: " + transform, e);
    }
    task = new TransformTask<Record, Record>(source, dest, transformFn);
  } else {
    task = new CopyTask<Record>(source, dest);
  }

  task.setConf(getConf());

  if (noCompaction) {
    task.noCompaction();
  }

  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }

  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }

  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Added {} records to \"{}\"",
        task.getCount(), datasets.get(1));
    return 0;
  } else {
    return 1;
  }
}
 
Example #25
Source File: CQLTarget.java    From hdfs2cass with Apache License 2.0
@Override
public Target outputConf(final String key, final String value) {
  extraConf.put(key, value);
  return this;
}
 
Example #26
Source File: DatasetTarget.java    From kite with Apache License 2.0
@Override
public Target outputConf(String key, String value) {
  formatBundle.set(key, value);
  return this;
}
 
Example #27
Source File: CreateSessions.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Turn debug on while in development.
  getPipeline().enableDebug();
  getPipeline().getConfiguration().set("crunch.log.job.progress", "true");

  Dataset<StandardEvent> eventsDataset = Datasets.load(
      "dataset:hdfs:/tmp/data/default/events", StandardEvent.class);

  View<StandardEvent> eventsToProcess;
  if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
    // get the current minute
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    long currentMinute = cal.getTimeInMillis();
    // restrict events to before the current minute
    // in the workflow, this also has a lower bound for the timestamp
    eventsToProcess = eventsDataset.toBefore("timestamp", currentMinute);

  } else if (isView(args[0])) {
    eventsToProcess = Datasets.load(args[0], StandardEvent.class);
  } else {
    eventsToProcess = FileSystemDatasets.viewForPath(eventsDataset, new Path(args[0]));
  }

  if (eventsToProcess.isEmpty()) {
    LOG.info("No records to process.");
    return 0;
  }

  // Create a parallel collection from the working partition
  PCollection<StandardEvent> events = read(
      CrunchDatasets.asSource(eventsToProcess));

  // Group events by user and cookie id, then create a session for each group
  PCollection<Session> sessions = events
      .by(new GetSessionKey(), Avros.strings())
      .groupByKey()
      .parallelDo(new MakeSession(), Avros.specifics(Session.class));

  // Write the sessions to the "sessions" Dataset
  getPipeline().write(sessions,
      CrunchDatasets.asTarget("dataset:hive:/tmp/data/default/sessions"),
      Target.WriteMode.APPEND);

  return run().succeeded() ? 0 : 1;
}
 
Example #28
Source File: CrunchDatasets.java    From kite with Apache License 2.0
/**
 * Expose the {@link Dataset} or {@link View} represented by the given
 * URI as a Crunch {@link Target}.
 *
 * @param uri the dataset or view URI
 * @return a {@link Target} for the dataset or view
 *
 * @since 0.15.0
 */
public static Target asTarget(String uri) {
  return asTarget(URI.create(uri));
}
 
Example #29
Source File: CrunchDatasets.java    From kite with Apache License 2.0
/**
 * Expose the given {@link View} as a Crunch {@link Target}.
 *
 * @param view the view to write to
 * @param <E>  the type of entity stored in the view
 * @return a {@link Target} for the view
 *
 * @since 0.14.0
 */
public static <E> Target asTarget(View<E> view) {
  return new DatasetTarget<E>(view);
}
 
Example #30
Source File: CrunchDatasets.java    From kite with Apache License 2.0
/**
 * Expose the {@link Dataset} or {@link View} represented by the given
 * URI as a Crunch {@link Target}.
 *
 * @param uri the dataset or view URI
 * @return a {@link Target} for the dataset or view
 *
 * @since 0.15.0
 */
public static Target asTarget(URI uri) {
  return new DatasetTarget<Object>(uri);
}
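
The asTarget overloads in Examples #28 to #30 (String URI, View, and java.net.URI) produce equivalent targets; the examples on this page simply use whichever form is already at hand. A small sketch, assuming a hypothetical dataset URI:

// Equivalent ways to obtain a Target for the same dataset; the URI below is an
// assumption used only for illustration.
Target fromString = CrunchDatasets.asTarget("dataset:hdfs:/tmp/data/out");
Target fromUri = CrunchDatasets.asTarget(URI.create("dataset:hdfs:/tmp/data/out"));

View<GenericRecord> outputView = Datasets.load("dataset:hdfs:/tmp/data/out",
    GenericRecord.class);
Target fromView = CrunchDatasets.asTarget(outputView);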