org.apache.crunch.PCollection Java Examples
The following examples show how to use
org.apache.crunch.PCollection.
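Before the project examples, here is a minimal, self-contained sketch of the basic PCollection workflow using the in-memory MemPipeline: build a collection, transform it with parallelDo, and materialize the result. This sketch is not taken from any of the projects below; the class name PCollectionSketch is a placeholder chosen only for illustration.

import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.types.writable.Writables;

public class PCollectionSketch {
  public static void main(String[] args) {
    // Build an in-memory PCollection; MemPipeline needs no Hadoop cluster.
    PCollection<String> words = MemPipeline.collectionOf("apache", "crunch", "pcollection");

    // parallelDo applies a MapFn to every element and declares the output PType.
    PCollection<String> upper = words.parallelDo(new MapFn<String, String>() {
      @Override
      public String map(String input) {
        return input.toUpperCase();
      }
    }, Writables.strings());

    // materialize() forces evaluation and exposes the results as an Iterable.
    for (String s : upper.materialize()) {
      System.out.println(s);
    }
  }
}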
Example #1
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #2
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGeneric() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 10, 0);
}
Example #3
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #4
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #5
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #6
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #7
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #8
Source File: Thrift.java From hdfs2cass with Apache License 2.0 | 6 votes |
public static <T extends SpecificRecord> PCollection<ThriftRecord> byFieldNames(
    final PCollection<T> collection,
    final String rowKeyFieldName,
    final String ttlFieldName,
    final String timestampFieldName
) {
  final Class<T> recordType = collection.getPType().getTypeClass();
  T record;
  try {
    record = recordType.getConstructor().newInstance();
  } catch (Exception e) {
    throw new RuntimeException("Could not create an instance of the record to determine its schema", e);
  }

  ThriftByFieldNamesFn<T> doFn = new ThriftByFieldNamesFn<T>(record.getSchema(),
      rowKeyFieldName, ttlFieldName, timestampFieldName);
  return collection.parallelDo(doFn, ThriftRecord.PTYPE);
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #10
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().provided("version").build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);

  Assert.assertEquals(1, datasetSize(inputView));

  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #11
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 6 votes |
@Test @Ignore("Requires Hadoop binaries") public void givenCollection_whenWriteCalled_fileWrittenSuccessfully() throws IOException { PCollection<String> inputStrings = MemPipeline.collectionOf("Hello", "Apache", "Crunch", Calendar.getInstance() .toString()); final String outputFilePath = createOutputPath(); Target target = To.textFile(outputFilePath); inputStrings.write(target); Pipeline pipeline = MemPipeline.getInstance(); PCollection<String> lines = pipeline.readTextFile(outputFilePath); assertIterableEquals(inputStrings.materialize(), lines.materialize()); }
Example #12
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
Example #13
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
Example #14
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testMultipleFileReadingFromCrunch() throws IOException {
  Dataset<Record> inputDatasetA = repo.create("ns", "inA", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> inputDatasetB = repo.create("ns", "inB", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDatasetA, 5, 0);
  writeTestUsers(inputDatasetB, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> dataA = pipeline.read(
      CrunchDatasets.asSource(inputDatasetA));
  PCollection<GenericData.Record> dataB = pipeline.read(
      CrunchDatasets.asSource(inputDatasetB));
  pipeline.write(dataA.union(dataB), CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #15
Source File: CrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
  PType<E> type = collection.getPType();
  PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
  PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
  PGroupedTable<E, Void> grouped =
      numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey();
  return grouped.ungroup().keys();
}
Example #16
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testViewUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  URI sourceViewUri = new URIBuilder(repo.getUri(), "ns", "in").with("username",
      "test-0").build();
  View<Record> inputView = Datasets.<Record, Dataset<Record>> load(sourceViewUri,
      Record.class);
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets
      .asSource(sourceViewUri, GenericData.Record.class));

  URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out").with(
      "email", "email-0").build();
  pipeline.write(data, CrunchDatasets.asTarget(targetViewUri), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #17
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testPartitionedSourceAndTargetWritingToTopLevel() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));

  // check all records are in the correct partition
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, false);
  Assert.assertNotNull(outputPart0);
  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #18
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
Example #19
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #20
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
Example #21
Source File: JoinFilterExampleCrunch.java From hadoop-arch-book with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception {
  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);

  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>
  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath); //<2>
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);

  PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo( //<3>
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.pairs(Avros.longs(), Avros.ints())));

  fooTable = fooTable.filter(new FooFilter(fooValMax)); //<4>

  PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy = //<5>
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
      .join(fooTable, barTable, JoinType.INNER_JOIN);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
      joinedTable.filter(new JoinFilter(joinValMax));

  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

  PipelineResult result = pipeline.done();

  return result.succeeded() ? 0 : 1;
}
Example #22
Source File: StagingToPersistent.java From kite-examples with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  final long startOfToday = startOfDay();

  // the destination dataset
  Dataset<Record> persistent = Datasets.load(
      "dataset:file:/tmp/data/logs", Record.class);

  // the source: anything before today in the staging area
  Dataset<Record> staging = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);
  View<Record> ready = staging.toBefore("timestamp", startOfToday);

  ReadableSource<Record> source = CrunchDatasets.asSource(ready);
  PCollection<Record> stagedLogs = read(source);

  getPipeline().write(stagedLogs,
      CrunchDatasets.asTarget(persistent), Target.WriteMode.APPEND);

  PipelineResult result = run();

  if (result.succeeded()) {
    // remove the source data partition from staging
    ready.deleteAll();
    return 0;
  } else {
    return 1;
  }
}
Example #23
Source File: WordCount.java From tutorials with MIT License | 5 votes |
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar"
        + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  String inputPath = args[0];
  String outputPath = args[1];

  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(WordCount.class, getConf());

  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(inputPath);

  // Define a function that splits each line in a PCollection of Strings into
  // a PCollection made up of the individual words in the file.
  // The second argument sets the serialization format.
  PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());

  // Take the collection of words and remove known stop words.
  PCollection<String> noStopWords = words.filter(new StopWordFilter());

  // The count method applies a series of Crunch primitives and returns
  // a map of the unique words in the input PCollection to their counts.
  PTable<String, Long> counts = noStopWords.count();

  // Instruct the pipeline to write the resulting counts to a text file.
  pipeline.writeTextFile(counts, outputPath);

  // Execute the pipeline as a MapReduce.
  PipelineResult result = pipeline.done();

  return result.succeeded() ? 0 : 1;
}
Example #24
Source File: ToUpperCaseWithCounterFnUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void whenFunctionCalled_counterIncementendForChangedValues() {
  PCollection<String> inputStrings = MemPipeline.collectionOf("This", "is", "a", "TEST", "string");
  PCollection<String> upperCaseStrings = inputStrings.parallelDo(new ToUpperCaseWithCounterFn(),
      Writables.strings());

  assertEquals(ImmutableList.of("THIS", "IS", "A", "TEST", "STRING"),
      Lists.newArrayList(upperCaseStrings.materialize()));
  assertEquals(4L, MemPipeline.getCounters()
      .findCounter("UpperCase", "modified")
      .getValue());
}
Example #25
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void givenPipeLineAndSource_whenSourceRead_thenExpectedNumberOfRecordsRead() {
  Pipeline pipeline = MemPipeline.getInstance();
  Source<String> source = From.textFile(INPUT_FILE_PATH);

  PCollection<String> lines = pipeline.read(source);

  assertEquals(21, lines.asCollection()
      .getValue()
      .size());
}
Example #26
Source File: LegacyHdfs2Cass.java From hdfs2cass with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job
  Pipeline pipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // Read records from Avro files in inputFolder
  PCollection<ByteBuffer> records = pipeline.read(
      From.avroFile(inputList(input), Avros.records(ByteBuffer.class)));

  // Transform the input
  String protocol = outputUri.getScheme();
  if (protocol.equalsIgnoreCase("thrift")) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  } else if (protocol.equalsIgnoreCase("cql")) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
Example #27
Source File: StopWordFilterUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void givenWordCollection_whenFiltered_thenStopWordsRemoved() {
  PCollection<String> words = MemPipeline.collectionOf("This", "is", "a", "test", "sentence");

  PCollection<String> noStopWords = words.filter(new StopWordFilter());

  assertEquals(ImmutableList.of("This", "test", "sentence"),
      Lists.newArrayList(noStopWords.materialize()));
}
Example #28
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 5 votes |
@Test @Ignore("Requires Hadoop binaries") public void givenPipeLine_whenWriteTextFileCalled_fileWrittenSuccessfully() throws IOException { Pipeline pipeline = MemPipeline.getInstance(); PCollection<String> inputStrings = MemPipeline.collectionOf("Hello", "Apache", "Crunch", Calendar.getInstance() .toString()); final String outputFilePath = createOutputPath(); pipeline.writeTextFile(inputStrings, outputFilePath); PCollection<String> lines = pipeline.readTextFile(outputFilePath); assertIterableEquals(inputStrings.materialize(), lines.materialize()); }
Example #29
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void givenPipeLine_whenTextFileRead_thenExpectedNumberOfRecordsRead() {
  Pipeline pipeline = MemPipeline.getInstance();
  PCollection<String> lines = pipeline.readTextFile(INPUT_FILE_PATH);

  assertEquals(21, lines.asCollection()
      .getValue()
      .size());
}
Example #30
Source File: CreateSessions.java From kite-examples with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
  // Turn debug on while in development.
  getPipeline().enableDebug();
  getPipeline().getConfiguration().set("crunch.log.job.progress", "true");

  Dataset<StandardEvent> eventsDataset = Datasets.load(
      "dataset:hdfs:/tmp/data/default/events", StandardEvent.class);

  View<StandardEvent> eventsToProcess;
  if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
    // get the current minute
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    long currentMinute = cal.getTimeInMillis();
    // restrict events to before the current minute
    // in the workflow, this also has a lower bound for the timestamp
    eventsToProcess = eventsDataset.toBefore("timestamp", currentMinute);
  } else if (isView(args[0])) {
    eventsToProcess = Datasets.load(args[0], StandardEvent.class);
  } else {
    eventsToProcess = FileSystemDatasets.viewForPath(eventsDataset, new Path(args[0]));
  }

  if (eventsToProcess.isEmpty()) {
    LOG.info("No records to process.");
    return 0;
  }

  // Create a parallel collection from the working partition
  PCollection<StandardEvent> events = read(
      CrunchDatasets.asSource(eventsToProcess));

  // Group events by user and cookie id, then create a session for each group
  PCollection<Session> sessions = events
      .by(new GetSessionKey(), Avros.strings())
      .groupByKey()
      .parallelDo(new MakeSession(), Avros.specifics(Session.class));

  // Write the sessions to the "sessions" Dataset
  getPipeline().write(sessions,
      CrunchDatasets.asTarget("dataset:hive:/tmp/data/default/sessions"),
      Target.WriteMode.APPEND);

  return run().succeeded() ? 0 : 1;
}