org.apache.crunch.PCollection Java Examples
The following examples show how to use
org.apache.crunch.PCollection.
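Before the project examples, here is a minimal, self-contained sketch of the basic PCollection workflow using the in-memory MemPipeline: build a collection, transform it with parallelDo, and materialize the result. This sketch is not taken from any of the projects below; the class name PCollectionSketch is a placeholder chosen only for illustration.

import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.types.writable.Writables;

public class PCollectionSketch {
  public static void main(String[] args) {
    // Build an in-memory PCollection; MemPipeline needs no Hadoop cluster.
    PCollection<String> words = MemPipeline.collectionOf("apache", "crunch", "pcollection");

    // parallelDo applies a MapFn to every element and declares the output PType.
    PCollection<String> upper = words.parallelDo(new MapFn<String, String>() {
      @Override
      public String map(String input) {
        return input.toUpperCase();
      }
    }, Writables.strings());

    // materialize() forces evaluation and exposes the results as an Iterable.
    for (String s : upper.materialize()) {
      System.out.println(s);
    }
  }
}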
Example #1
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #2
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGeneric() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 10, 0);
}
Example #3
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #4
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #5
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #6
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #7
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #8
Source File: Thrift.java From hdfs2cass with Apache License 2.0 | 6 votes |
public static <T extends SpecificRecord> PCollection<ThriftRecord> byFieldNames(
    final PCollection<T> collection,
    final String rowKeyFieldName,
    final String ttlFieldName,
    final String timestampFieldName
) {
  final Class<T> recordType = collection.getPType().getTypeClass();
  T record;
  try {
    record = recordType.getConstructor().newInstance();
  } catch (Exception e) {
    throw new RuntimeException("Could not create an instance of the record to determine its schema", e);
  }

  ThriftByFieldNamesFn<T> doFn = new ThriftByFieldNamesFn<T>(record.getSchema(),
      rowKeyFieldName, ttlFieldName, timestampFieldName);
  return collection.parallelDo(doFn, ThriftRecord.PTYPE);
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #10
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().provided("version").build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);

  Assert.assertEquals(1, datasetSize(inputView));

  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #11
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 6 votes |
@Test @Ignore("Requires Hadoop binaries") public void givenCollection_whenWriteCalled_fileWrittenSuccessfully() throws IOException { PCollection<String> inputStrings = MemPipeline.collectionOf("Hello", "Apache", "Crunch", Calendar.getInstance() .toString()); final String outputFilePath = createOutputPath(); Target target = To.textFile(outputFilePath); inputStrings.write(target); Pipeline pipeline = MemPipeline.getInstance(); PCollection<String> lines = pipeline.readTextFile(outputFilePath); assertIterableEquals(inputStrings.materialize(), lines.materialize()); }
Example #12
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
Example #13
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
Example #14
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testMultipleFileReadingFromCrunch() throws IOException {
  Dataset<Record> inputDatasetA = repo.create("ns", "inA", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> inputDatasetB = repo.create("ns", "inB", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDatasetA, 5, 0);
  writeTestUsers(inputDatasetB, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> dataA = pipeline.read(
      CrunchDatasets.asSource(inputDatasetA));
  PCollection<GenericData.Record> dataB = pipeline.read(
      CrunchDatasets.asSource(inputDatasetB));
  pipeline.write(dataA.union(dataB), CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #15
Source File: CrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
  PType<E> type = collection.getPType();
  PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
  PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
  PGroupedTable<E, Void> grouped =
      numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey();
  return grouped.ungroup().keys();
}
Example #16
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testViewUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  URI sourceViewUri = new URIBuilder(repo.getUri(), "ns", "in").with("username",
      "test-0").build();
  View<Record> inputView = Datasets.<Record, Dataset<Record>> load(sourceViewUri,
      Record.class);
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets
      .asSource(sourceViewUri, GenericData.Record.class));

  URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out").with(
      "email", "email-0").build();
  pipeline.write(data, CrunchDatasets.asTarget(targetViewUri), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #17
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testPartitionedSourceAndTargetWritingToTopLevel() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));

  // check all records are in the correct partition
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, false);
  Assert.assertNotNull(outputPart0);
  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #18
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test(expected = CrunchRuntimeException.class)
public void testWriteModeDefaultFailsWithExisting() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 0);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset));
}
Example #19
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #20
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
Example #21
Source File: JoinFilterExampleCrunch.java From hadoop-arch-book with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception {
  String fooInputPath = args[0];
  String barInputPath = args[1];
  String outputPath = args[2];
  int fooValMax = Integer.parseInt(args[3]);
  int joinValMax = Integer.parseInt(args[4]);
  int numberOfReducers = Integer.parseInt(args[5]);

  Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>
  PCollection<String> fooLines = pipeline.readTextFile(fooInputPath); //<2>
  PCollection<String> barLines = pipeline.readTextFile(barInputPath);

  PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo( //<3>
      new FooIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.pairs(Avros.longs(), Avros.ints())));

  fooTable = fooTable.filter(new FooFilter(fooValMax)); //<4>

  PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
      Avros.tableOf(Avros.longs(), Avros.ints()));

  DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy = //<5>
      new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
      .join(fooTable, barTable, JoinType.INNER_JOIN);

  PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable =
      joinedTable.filter(new JoinFilter(joinValMax));

  filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

  PipelineResult result = pipeline.done();

  return result.succeeded() ? 0 : 1;
}
Example #22
Source File: StagingToPersistent.java From kite-examples with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  final long startOfToday = startOfDay();

  // the destination dataset
  Dataset<Record> persistent = Datasets.load(
      "dataset:file:/tmp/data/logs", Record.class);

  // the source: anything before today in the staging area
  Dataset<Record> staging = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);
  View<Record> ready = staging.toBefore("timestamp", startOfToday);

  ReadableSource<Record> source = CrunchDatasets.asSource(ready);
  PCollection<Record> stagedLogs = read(source);

  getPipeline().write(stagedLogs,
      CrunchDatasets.asTarget(persistent), Target.WriteMode.APPEND);

  PipelineResult result = run();

  if (result.succeeded()) {
    // remove the source data partition from staging
    ready.deleteAll();
    return 0;
  } else {
    return 1;
  }
}
Example #23
Source File: WordCount.java From tutorials with MIT License | 5 votes |
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar"
        + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  String inputPath = args[0];
  String outputPath = args[1];

  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(WordCount.class, getConf());

  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(inputPath);

  // Define a function that splits each line in a PCollection of Strings into
  // a PCollection made up of the individual words in the file.
  // The second argument sets the serialization format.
  PCollection<String> words = lines.parallelDo(new Tokenizer(), Writables.strings());

  // Take the collection of words and remove known stop words.
  PCollection<String> noStopWords = words.filter(new StopWordFilter());

  // The count method applies a series of Crunch primitives and returns
  // a map of the unique words in the input PCollection to their counts.
  PTable<String, Long> counts = noStopWords.count();

  // Instruct the pipeline to write the resulting counts to a text file.
  pipeline.writeTextFile(counts, outputPath);

  // Execute the pipeline as a MapReduce.
  PipelineResult result = pipeline.done();

  return result.succeeded() ? 0 : 1;
}
Example #24
Source File: ToUpperCaseWithCounterFnUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void whenFunctionCalled_counterIncementendForChangedValues() {
  PCollection<String> inputStrings = MemPipeline.collectionOf("This", "is", "a", "TEST", "string");
  PCollection<String> upperCaseStrings = inputStrings.parallelDo(new ToUpperCaseWithCounterFn(),
      Writables.strings());

  assertEquals(ImmutableList.of("THIS", "IS", "A", "TEST", "STRING"),
      Lists.newArrayList(upperCaseStrings.materialize()));
  assertEquals(4L, MemPipeline.getCounters()
      .findCounter("UpperCase", "modified")
      .getValue());
}
Example #25
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void givenPipeLineAndSource_whenSourceRead_thenExpectedNumberOfRecordsRead() {
  Pipeline pipeline = MemPipeline.getInstance();
  Source<String> source = From.textFile(INPUT_FILE_PATH);

  PCollection<String> lines = pipeline.read(source);

  assertEquals(21, lines.asCollection()
      .getValue()
      .size());
}
Example #26
Source File: LegacyHdfs2Cass.java From hdfs2cass with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  new JCommander(this, args);

  URI outputUri = URI.create(output);

  // Our crunch job is a MapReduce job
  Pipeline pipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Parse & fetch info about target Cassandra cluster
  CassandraParams params = CassandraParams.parse(outputUri);

  // Read records from Avro files in inputFolder
  PCollection<ByteBuffer> records = pipeline.read(
      From.avroFile(inputList(input), Avros.records(ByteBuffer.class)));

  // Transform the input
  String protocol = outputUri.getScheme();
  if (protocol.equalsIgnoreCase("thrift")) {
    records
        // First convert ByteBuffers to ThriftRecords
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        // Then group the ThriftRecords in preparation for writing them
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(params.createGroupingOptions())
        // Finally write the ThriftRecords to Cassandra
        .write(new ThriftTarget(outputUri, params));
  } else if (protocol.equalsIgnoreCase("cql")) {
    records
        // In case of CQL, convert ByteBuffers to CQLRecords
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(params.getKeyFn(), Avros.bytes())
        .groupByKey(params.createGroupingOptions())
        .write(new CQLTarget(outputUri, params));
  }

  // Execute the pipeline
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
Example #27
Source File: StopWordFilterUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void givenWordCollection_whenFiltered_thenStopWordsRemoved() {
  PCollection<String> words = MemPipeline.collectionOf("This", "is", "a", "test", "sentence");

  PCollection<String> noStopWords = words.filter(new StopWordFilter());

  assertEquals(ImmutableList.of("This", "test", "sentence"),
      Lists.newArrayList(noStopWords.materialize()));
}
Example #28
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 5 votes |
@Test @Ignore("Requires Hadoop binaries") public void givenPipeLine_whenWriteTextFileCalled_fileWrittenSuccessfully() throws IOException { Pipeline pipeline = MemPipeline.getInstance(); PCollection<String> inputStrings = MemPipeline.collectionOf("Hello", "Apache", "Crunch", Calendar.getInstance() .toString()); final String outputFilePath = createOutputPath(); pipeline.writeTextFile(inputStrings, outputFilePath); PCollection<String> lines = pipeline.readTextFile(outputFilePath); assertIterableEquals(inputStrings.materialize(), lines.materialize()); }
Example #29
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 5 votes |
@Test
public void givenPipeLine_whenTextFileRead_thenExpectedNumberOfRecordsRead() {
  Pipeline pipeline = MemPipeline.getInstance();
  PCollection<String> lines = pipeline.readTextFile(INPUT_FILE_PATH);

  assertEquals(21, lines.asCollection()
      .getValue()
      .size());
}
Example #30
Source File: CreateSessions.java From kite-examples with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
  // Turn debug on while in development.
  getPipeline().enableDebug();
  getPipeline().getConfiguration().set("crunch.log.job.progress", "true");

  Dataset<StandardEvent> eventsDataset = Datasets.load(
      "dataset:hdfs:/tmp/data/default/events", StandardEvent.class);

  View<StandardEvent> eventsToProcess;
  if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
    // get the current minute
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    long currentMinute = cal.getTimeInMillis();
    // restrict events to before the current minute
    // in the workflow, this also has a lower bound for the timestamp
    eventsToProcess = eventsDataset.toBefore("timestamp", currentMinute);
  } else if (isView(args[0])) {
    eventsToProcess = Datasets.load(args[0], StandardEvent.class);
  } else {
    eventsToProcess = FileSystemDatasets.viewForPath(eventsDataset, new Path(args[0]));
  }

  if (eventsToProcess.isEmpty()) {
    LOG.info("No records to process.");
    return 0;
  }

  // Create a parallel collection from the working partition
  PCollection<StandardEvent> events = read(
      CrunchDatasets.asSource(eventsToProcess));

  // Group events by user and cookie id, then create a session for each group
  PCollection<Session> sessions = events
      .by(new GetSessionKey(), Avros.strings())
      .groupByKey()
      .parallelDo(new MakeSession(), Avros.specifics(Session.class));

  // Write the sessions to the "sessions" Dataset
  getPipeline().write(sessions,
      CrunchDatasets.asTarget("dataset:hive:/tmp/data/default/sessions"),
      Target.WriteMode.APPEND);

  return run().succeeded() ? 0 : 1;
}