org.apache.crunch.Target Java Examples
The following examples show how to use org.apache.crunch.Target. Each example is taken from an open-source project; the source file and license are noted above it.
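All of the examples below follow the same basic pattern: build a Target for the output, then hand it to Pipeline.write (or PCollection.write) together with a Target.WriteMode (DEFAULT, OVERWRITE, APPEND, or CHECKPOINT). The following is a minimal sketch of that pattern in isolation using Crunch's in-memory pipeline; the output path is hypothetical, and, as Example #13 notes, writing files from MemPipeline still needs Hadoop's local filesystem support on the classpath.

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.io.To;

public class TargetWriteModeSketch {
  public static void main(String[] args) {
    // In-memory pipeline, convenient for small local experiments.
    Pipeline pipeline = MemPipeline.getInstance();
    PCollection<String> lines = MemPipeline.collectionOf("Hello", "Apache", "Crunch");

    // To.textFile(...) builds a Target for a text-file output path (hypothetical here).
    Target target = To.textFile("/tmp/crunch-target-sketch");

    // OVERWRITE replaces any existing output; APPEND adds to it,
    // and CHECKPOINT reuses existing output when the inputs are unchanged.
    pipeline.write(lines, target, Target.WriteMode.OVERWRITE);
    pipeline.done();
  }
}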
Example #1
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
Example #2
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testMultipleFileReadingFromCrunch() throws IOException {
  Dataset<Record> inputDatasetA = repo.create("ns", "inA", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> inputDatasetB = repo.create("ns", "inB", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDatasetA, 5, 0);
  writeTestUsers(inputDatasetB, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> dataA = pipeline.read(
      CrunchDatasets.asSource(inputDatasetA));
  PCollection<GenericData.Record> dataB = pipeline.read(
      CrunchDatasets.asSource(inputDatasetB));
  pipeline.write(dataA.union(dataB), CrunchDatasets.asTarget(outputDataset),
      Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #3
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testWriteModeOverwrite() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 1, 0);
  writeTestUsers(outputDataset, 1, 1);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget((View<Record>) outputDataset),
      Target.WriteMode.OVERWRITE);
  pipeline.run();

  checkTestUsers(outputDataset, 1);
}
Example #4
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
Example #5
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .provided("version").build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);

  Assert.assertEquals(1, datasetSize(inputView));

  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #6
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #7
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #8
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #9
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
Example #10
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #11
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
Example #12
Source File: TestCrunchDatasetsHBase.java From kite with Apache License 2.0 | 6 votes |
@Test
public void testGeneric() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 10, 0);
}
Example #13
Source File: MemPipelineUnitTest.java From tutorials with MIT License | 6 votes |
@Test @Ignore("Requires Hadoop binaries") public void givenCollection_whenWriteCalled_fileWrittenSuccessfully() throws IOException { PCollection<String> inputStrings = MemPipeline.collectionOf("Hello", "Apache", "Crunch", Calendar.getInstance() .toString()); final String outputFilePath = createOutputPath(); Target target = To.textFile(outputFilePath); inputStrings.write(target); Pipeline pipeline = MemPipeline.getInstance(); PCollection<String> lines = pipeline.readTextFile(outputFilePath); assertIterableEquals(inputStrings.materialize(), lines.materialize()); }
Example #14
Source File: TransformTask.java From kite with Apache License 2.0 | 5 votes |
/**
 * Set the output write mode: default, overwrite, or append.
 *
 * @param mode the output write mode
 * @return this for method chaining
 */
public TransformTask setWriteMode(Target.WriteMode mode) {
  Preconditions.checkArgument(mode != Target.WriteMode.CHECKPOINT,
      "Checkpoint is not an allowed write mode");
  this.mode = mode;
  return this;
}
Example #15
Source File: StagingToPersistent.java From kite-examples with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  final long startOfToday = startOfDay();

  // the destination dataset
  Dataset<Record> persistent = Datasets.load(
      "dataset:file:/tmp/data/logs", Record.class);

  // the source: anything before today in the staging area
  Dataset<Record> staging = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);
  View<Record> ready = staging.toBefore("timestamp", startOfToday);

  ReadableSource<Record> source = CrunchDatasets.asSource(ready);
  PCollection<Record> stagedLogs = read(source);

  getPipeline().write(stagedLogs,
      CrunchDatasets.asTarget(persistent), Target.WriteMode.APPEND);

  PipelineResult result = run();

  if (result.succeeded()) {
    // remove the source data partition from staging
    ready.deleteAll();
    return 0;
  } else {
    return 1;
  }
}
Example #16
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
private void runCheckpointPipeline(View<Record> inputView,
    View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView),
      Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
Example #17
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable) outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable) outputView).isReady());
}
Example #18
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testViewUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  URI sourceViewUri = new URIBuilder(repo.getUri(), "ns", "in")
      .with("username", "test-0").build();
  View<Record> inputView = Datasets.<Record, Dataset<Record>>load(sourceViewUri,
      Record.class);
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets
      .asSource(sourceViewUri, GenericData.Record.class));
  URI targetViewUri = new URIBuilder(repo.getUri(), "ns", "out")
      .with("email", "email-0").build();
  pipeline.write(data, CrunchDatasets.asTarget(targetViewUri), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
Example #19
Source File: TestCrunchDatasets.java From kite with Apache License 2.0 | 5 votes |
@Test
public void testPartitionedSourceAndTargetWritingToTopLevel() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));

  // check all records are in the correct partition
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, false);
  Assert.assertNotNull(outputPart0);
  Assert.assertEquals(5, datasetSize(outputPart0));
}
Example #20
Source File: CopyCommand.java From kite with Apache License 2.0 | 4 votes |
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  View<GenericRecord> dest = load(datasets.get(1));
  View<GenericRecord> source = load(datasets.get(0))
      .asSchema(dest.getSchema());

  CopyTask task = new CopyTask<GenericRecord>(source, dest);
  task.setConf(getConf());

  if (noCompaction) {
    task.noCompaction();
  }

  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }

  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }

  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Added {} records to \"{}\"",
        task.getCount(), datasets.get(1));
    return 0;
  } else {
    return 1;
  }
}
Example #21
Source File: JSONImportCommand.java From kite with Apache License 2.0 | 4 votes |
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 2,
      "JSON path and target dataset name are required.");

  Path source = qualifiedPath(targets.get(0));
  FileSystem sourceFS = source.getFileSystem(getConf());
  Preconditions.checkArgument(sourceFS.exists(source),
      "JSON path does not exist: " + source);

  String dataset = targets.get(1);

  View<Record> target = load(dataset, Record.class);
  Schema datasetSchema = target.getDataset().getDescriptor().getSchema();

  DatasetDescriptor jsonDescriptor = new DatasetDescriptor.Builder()
      .location(source.toUri())
      .schema(ColumnMappingParser.removeEmbeddedMapping(
          PartitionStrategyParser.removeEmbeddedStrategy(datasetSchema)))
      .format("json")
      .build();

  TemporaryFileSystemDatasetRepository repo =
      new TemporaryFileSystemDatasetRepository(getConf(),
          // ensure the same FS as the file source is used
          sourceFS.makeQualified(new Path("/tmp/" + UUID.randomUUID().toString())),
          target.getDataset().getNamespace(),
          UUID.randomUUID().toString());

  try {
    FileSystemDataset<Record> jsonDataset =
        (FileSystemDataset) repo.create("import", "json", jsonDescriptor);

    Iterator<Path> iter = jsonDataset.pathIterator().iterator();
    Preconditions.checkArgument(iter.hasNext(),
        "JSON path has no data files: " + source);

    TaskUtil.configure(getConf()).addJars(jars);

    TransformTask task;
    if (transform != null) {
      DoFn<Record, Record> transformFn;
      try {
        DynConstructors.Ctor<DoFn<Record, Record>> ctor =
            new DynConstructors.Builder(DoFn.class)
                .loader(loaderForJars(jars))
                .impl(transform)
                .buildChecked();
        transformFn = ctor.newInstance();
      } catch (NoSuchMethodException e) {
        throw new DatasetException(
            "Cannot find no-arg constructor for class: " + transform, e);
      }
      task = new TransformTask<Record, Record>(
          jsonDataset, target, transformFn);
    } else {
      task = new CopyTask<Record>(jsonDataset, target);
    }

    task.setConf(getConf());

    if (noCompaction) {
      task.noCompaction();
    }

    if (numWriters >= 0) {
      task.setNumWriters(numWriters);
    }

    if (filesPerPartition > 0) {
      task.setFilesPerPartition(filesPerPartition);
    }

    if (overwrite) {
      task.setWriteMode(Target.WriteMode.OVERWRITE);
    }

    PipelineResult result = task.run();

    if (result.succeeded()) {
      long count = task.getCount();
      if (count > 0) {
        console.info("Added {} records to \"{}\"", count, dataset);
      }
      return 0;
    } else {
      return 1;
    }
  } finally {
    // clean up the temporary repository
    repo.delete();
  }
}
Example #22
Source File: CompactionTask.java From kite with Apache License 2.0 | 4 votes |
public CompactionTask(View<T> view) {
  checkCompactable(view);
  this.task = new CopyTask<T>(view, view);
  task.setWriteMode(Target.WriteMode.OVERWRITE);
}
Example #23
Source File: ThriftTarget.java From hdfs2cass with Apache License 2.0 | 4 votes |
@Override
public Target outputConf(final String key, final String value) {
  extraConf.put(key, value);
  return this;
}
Example #24
Source File: TransformCommand.java From kite with Apache License 2.0 | 4 votes |
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  View<Record> source = load(datasets.get(0), Record.class);
  View<Record> dest = load(datasets.get(1), Record.class);

  TaskUtil.configure(getConf()).addJars(jars);

  TransformTask task;
  if (transform != null) {
    DoFn<Record, Record> transformFn;
    try {
      DynConstructors.Ctor<DoFn<Record, Record>> ctor =
          new DynConstructors.Builder(DoFn.class)
              .loader(loaderForJars(jars))
              .impl(transform)
              .buildChecked();
      transformFn = ctor.newInstance();
    } catch (NoSuchMethodException e) {
      throw new DatasetException(
          "Cannot find no-arg constructor for class: " + transform, e);
    }
    task = new TransformTask<Record, Record>(source, dest, transformFn);
  } else {
    task = new CopyTask<Record>(source, dest);
  }

  task.setConf(getConf());

  if (noCompaction) {
    task.noCompaction();
  }

  if (numWriters >= 0) {
    task.setNumWriters(numWriters);
  }

  if (filesPerPartition > 0) {
    task.setFilesPerPartition(filesPerPartition);
  }

  if (overwrite) {
    task.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  PipelineResult result = task.run();

  if (result.succeeded()) {
    console.info("Added {} records to \"{}\"",
        task.getCount(), datasets.get(1));
    return 0;
  } else {
    return 1;
  }
}
Example #25
Source File: CQLTarget.java From hdfs2cass with Apache License 2.0 | 4 votes |
@Override
public Target outputConf(final String key, final String value) {
  extraConf.put(key, value);
  return this;
}
Example #26
Source File: DatasetTarget.java From kite with Apache License 2.0 | 4 votes |
@Override
public Target outputConf(String key, String value) {
  formatBundle.set(key, value);
  return this;
}
Example #27
Source File: CreateSessions.java From kite-examples with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
  // Turn debug on while in development.
  getPipeline().enableDebug();
  getPipeline().getConfiguration().set("crunch.log.job.progress", "true");

  Dataset<StandardEvent> eventsDataset = Datasets.load(
      "dataset:hdfs:/tmp/data/default/events", StandardEvent.class);

  View<StandardEvent> eventsToProcess;
  if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
    // get the current minute
    Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    long currentMinute = cal.getTimeInMillis();
    // restrict events to before the current minute
    // in the workflow, this also has a lower bound for the timestamp
    eventsToProcess = eventsDataset.toBefore("timestamp", currentMinute);
  } else if (isView(args[0])) {
    eventsToProcess = Datasets.load(args[0], StandardEvent.class);
  } else {
    eventsToProcess = FileSystemDatasets.viewForPath(eventsDataset, new Path(args[0]));
  }

  if (eventsToProcess.isEmpty()) {
    LOG.info("No records to process.");
    return 0;
  }

  // Create a parallel collection from the working partition
  PCollection<StandardEvent> events = read(
      CrunchDatasets.asSource(eventsToProcess));

  // Group events by user and cookie id, then create a session for each group
  PCollection<Session> sessions = events
      .by(new GetSessionKey(), Avros.strings())
      .groupByKey()
      .parallelDo(new MakeSession(), Avros.specifics(Session.class));

  // Write the sessions to the "sessions" Dataset
  getPipeline().write(sessions,
      CrunchDatasets.asTarget("dataset:hive:/tmp/data/default/sessions"),
      Target.WriteMode.APPEND);

  return run().succeeded() ? 0 : 1;
}
Example #28
Source File: CrunchDatasets.java From kite with Apache License 2.0 | 2 votes |
/**
 * Expose the {@link Dataset} or {@link View} represented by the given
 * URI as a Crunch {@link Target}.
 *
 * @param uri the dataset or view URI
 * @return a {@link Target} for the dataset or view
 *
 * @since 0.15.0
 */
public static Target asTarget(String uri) {
  return asTarget(URI.create(uri));
}
Example #29
Source File: CrunchDatasets.java From kite with Apache License 2.0 | 2 votes |
/**
 * Expose the given {@link View} as a Crunch {@link Target}.
 *
 * @param view the view to write to
 * @param <E> the type of entity stored in the view
 * @return a {@link Target} for the view
 *
 * @since 0.14.0
 */
public static <E> Target asTarget(View<E> view) {
  return new DatasetTarget<E>(view);
}
Example #30
Source File: CrunchDatasets.java From kite with Apache License 2.0 | 2 votes |
/**
 * Expose the {@link Dataset} or {@link View} represented by the given
 * URI as a Crunch {@link Target}.
 *
 * @param uri the dataset or view URI
 * @return a {@link Target} for the dataset or view
 *
 * @since 0.15.0
 */
public static Target asTarget(URI uri) {
  return new DatasetTarget<Object>(uri);
}
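The three asTarget overloads above differ only in how the destination is named: a URI string, a java.net.URI, or an already-loaded View. The following is a small sketch of choosing between the URI and View forms; the dataset URI is hypothetical, and the pipeline and data are assumed to be set up the way the tests earlier in this page do it.

import java.net.URI;

import org.apache.avro.generic.GenericData;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.Target;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.View;
import org.kitesdk.data.crunch.CrunchDatasets;

class AsTargetSketch {
  // Sketch only: the dataset URI is hypothetical, and pipeline/data are assumed
  // to be prepared as in the examples above.
  static void writeOutput(Pipeline pipeline, PCollection<GenericData.Record> data) {
    // By URI: the target dataset is named by its Kite dataset URI.
    pipeline.write(data,
        CrunchDatasets.asTarget(URI.create("dataset:hdfs:/tmp/data/ns/out")),
        Target.WriteMode.APPEND);

    // By View: handy when the caller already holds a (possibly restricted) view.
    View<GenericData.Record> view =
        Datasets.load("dataset:hdfs:/tmp/data/ns/out", GenericData.Record.class);
    pipeline.write(data, CrunchDatasets.asTarget(view), Target.WriteMode.APPEND);
  }
}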