cascading.tap.Tap Java Examples
The following examples show how to use cascading.tap.Tap. They are drawn from open-source projects; the originating source file, project, and license are noted above each example.
Example #1
Source File: TapDataWriter.java from plunger with Apache License 2.0

/** Writes the {@link Tuple Tuples} provided in the {@link Data} instance to the supplied {@link Tap}. */
public Tap<?, ?, ?> toTap(Tap<?, ?, ?> tap) throws IOException {
  Class<?> tapConfigClass = TapTypeUtil.getTapConfigClass(tap);
  if (Configuration.class.equals(tapConfigClass)) {
    if (tap instanceof BasePartitionTap) {
      writeToHadoopPartitionTap(tap);
    } else {
      writeToHadoopTap(tap);
    }
  } else if (Properties.class.equals(tapConfigClass)) {
    writeToLocalTap(tap);
  } else {
    throw new IllegalArgumentException("Unsupported tap type: " + tap.getClass());
  }
  return tap;
}
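The tests later on this page (Examples #8, #9, and #15) show how this entry point is driven. As a minimal sketch, assuming the same plunger Data and Tuple types used there (the fields and path are illustrative):

// Hypothetical driver: write an in-memory Data instance through a Hadoop tap.
Fields fields = new Fields("letter", "number", "word");
Data data = new Data(fields, Arrays.asList(
    new Tuple("X", 1, "hello"),
    new Tuple("Y", 2, "world")));
Tap<?, ?, ?> tap = new cascading.tap.hadoop.Hfs(
    new cascading.scheme.hadoop.TextDelimited(fields), "/tmp/plunger-example");
new TapDataWriter(data).toTap(tap); // resolves the config class, then writes via writeToHadoopTap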
Example #2
Source File: TapDataWriter.java from plunger with Apache License 2.0

private void writeToHadoopPartitionTap(Tap<?, ?, ?> tap) throws IOException {
  @SuppressWarnings("unchecked")
  BasePartitionTap<JobConf, ?, ?> hadoopTap = (BasePartitionTap<JobConf, ?, ?>) tap;
  JobConf conf = new JobConf();

  // Avoids deletion of results when using a partition tap (close() will delete the _temporary
  // before the copy has been done if not in a flow)
  HadoopUtil.setIsInflow(conf);

  HadoopFlowProcess flowProcess = new HadoopFlowProcess(conf);
  hadoopTap.sinkConfInit(flowProcess, conf);
  TupleEntryCollector collector = hadoopTap.openForWrite(flowProcess);
  for (TupleEntry tuple : data.asTupleEntryList()) {
    collector.add(tuple);
  }
  collector.close();

  // We need to clean up the '_temporary' folder
  BasePartitionTap<JobConf, ?, ?> partitionTap = hadoopTap;
  @SuppressWarnings("unchecked")
  String basePath = partitionTap.getParent().getFullIdentifier(flowProcess);
  deleteTemporaryPath(new Path(basePath), FileSystem.get(conf));
}
Example #3
Source File: BottomUpBoundariesNodePartitioner.java from cascading-flink with Apache License 2.0

public BottomUpNoSplitConsecutiveBoundariesExpressionGraph() {
  super( SearchOrder.ReverseTopological );

  this.arc(
      or(
          new FlowElementExpression( Boundary.class, TypeExpression.Topo.LinearOut ),
          new FlowElementExpression( Tap.class, TypeExpression.Topo.LinearOut ),
          new FlowElementExpression( Group.class, TypeExpression.Topo.LinearOut )
      ),
      PathScopeExpression.ANY,
      new BoundariesElementExpression( ElementCapture.Primary )
  );
}
Example #4
Source File: BoundaryBeforeSinkTapTransformer.java from cascading-flink with Apache License 2.0

public SinkTapGraph() {
  super(SearchOrder.ReverseTopological);

  arc(
      not(
          OrElementExpression.or(
              new FlowElementExpression(Extent.class),
              new FlowElementExpression(Boundary.class))
      ),
      ScopeExpression.ANY,
      new FlowElementExpression(ElementCapture.Primary, Tap.class)
  );
}
Example #5
Source File: JDBCScheme.java from SpyGlass with Apache License 2.0

@Override
public void sourceConfInit( FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf ) {
  int concurrentReads = ( (JDBCTap) tap ).concurrentReads;

  if( selectQuery != null )
    DBInputFormat.setInput( conf, TupleRecord.class, selectQuery, countQuery, limit, concurrentReads );
  else {
    String tableName = ( (JDBCTap) tap ).getTableName();
    String joinedOrderBy = orderBy != null ? Util.join( orderBy, ", " ) : null;
    DBInputFormat.setInput( conf, TupleRecord.class, tableName, conditions, joinedOrderBy, limit, concurrentReads, columns );
  }

  if( inputFormatClass != null )
    conf.setInputFormat( inputFormatClass );
}
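Note how the scheme reaches back into its tap — casting to JDBCTap for the table name and read concurrency — rather than treating the Tap parameter generically. The Scheme/Tap split keeps data format and data location independent in the general case; these JDBC classes are deliberately paired.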
Example #6
Source File: FlinkFlowStep.java from cascading-flink with Apache License 2.0

private DataSet<Tuple> translateSource(FlowProcess flowProcess, ExecutionEnvironment env, FlowNode node, int dop) {

  Tap tap = this.getSingle(node.getSourceTaps());
  JobConf tapConfig = new JobConf(this.getNodeConfig(node));
  tap.sourceConfInit(flowProcess, tapConfig);
  tapConfig.set( "cascading.step.source", Tap.id( tap ) );

  Fields outFields = tap.getSourceFields();
  registerKryoTypes(outFields);

  JobConf sourceConfig = new JobConf(this.getNodeConfig(node));
  MultiInputFormat.addInputFormat(sourceConfig, tapConfig);

  DataSet<Tuple> src = env
      .createInput(new TapInputFormat(node), new TupleTypeInfo(outFields))
      .name(tap.getIdentifier())
      .setParallelism(dop)
      .withParameters(FlinkConfigConverter.toFlinkConfig(new Configuration(sourceConfig)));

  return src;
}
Example #7
Source File: TapDataWriter.java from plunger with Apache License 2.0

private void writeToLocalTap(Tap<?, ?, ?> tap) throws IOException {
  @SuppressWarnings("unchecked")
  Tap<Properties, ?, ?> localTap = (Tap<Properties, ?, ?>) tap;
  Properties conf = new Properties();
  LocalFlowProcess flowProcess = new LocalFlowProcess(conf);
  flowProcess.setStepStats(new LocalStepStats(new NullFlowStep(), NullClientState.INSTANCE));
  localTap.sinkConfInit(flowProcess, conf);
  TupleEntryCollector collector = localTap.openForWrite(flowProcess);
  for (TupleEntry tuple : data.asTupleEntryList()) {
    collector.add(tuple);
  }
  collector.close();
  localTap.commitResource(conf);
}
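Unlike the Hadoop variant in Example #21 below, this local writer calls commitResource itself — presumably because local taps have no surrounding Hadoop output-committer machinery, so nothing else would finalize the written resource.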
Example #8
Source File: TapDataWriterTest.java from plunger with Apache License 2.0

@Test
public void writeMultiSink() throws IOException {
  File tsvFolder1 = temporaryFolder.newFolder("data1");
  File tsvFolder2 = temporaryFolder.newFolder("data2");
  Tap<?, ?, ?> tap1 = new cascading.tap.hadoop.Hfs(new cascading.scheme.hadoop.TextDelimited(fields),
      tsvFolder1.getAbsolutePath());
  Tap<?, ?, ?> tap2 = new cascading.tap.hadoop.Hfs(new cascading.scheme.hadoop.TextDelimited(valueFields),
      tsvFolder2.getAbsolutePath());

  @SuppressWarnings("unchecked")
  cascading.tap.MultiSinkTap<?, ?, ?> multiTap = new cascading.tap.MultiSinkTap<>(tap1, tap2);

  Tap<?, ?, ?> returnedTap = new TapDataWriter(data).toTap(multiTap);

  assertThat(returnedTap == multiTap, is(true));

  String written1 = FileUtils.readFileToString(new File(tsvFolder1, "part-00000"), Charset.forName("UTF-8"));
  assertThat(written1, is("X\t1\thello\nY\t2\tworld\n"));

  String written2 = FileUtils.readFileToString(new File(tsvFolder2, "part-00000"), Charset.forName("UTF-8"));
  assertThat(written2, is("1\thello\n2\tworld\n"));
}
Example #9
Source File: TapDataWriterTest.java from plunger with Apache License 2.0

@Test
public void writeHadoopPartition() throws IOException {
  File tsvFolder = temporaryFolder.newFolder("data");
  cascading.tap.hadoop.PartitionTap partitionTap = new cascading.tap.hadoop.PartitionTap(
      new cascading.tap.hadoop.Hfs(new cascading.scheme.hadoop.TextDelimited(valueFields),
          tsvFolder.getAbsolutePath()), new DelimitedPartition(partitionFields));

  Data data = new Data(fields, Arrays.asList(new Tuple("X", 1, "hello"), new Tuple("Y", 2, "world")));
  Tap<?, ?, ?> returnedTap = new TapDataWriter(data).toTap(partitionTap);

  assertThat((cascading.tap.hadoop.PartitionTap) returnedTap, is(partitionTap));

  File tsvFileX = new File(new File(tsvFolder, "X"), "part-00000-00000");
  String writtenX = FileUtils.readFileToString(tsvFileX, Charset.forName("UTF-8"));
  assertThat(writtenX, is("1\thello\n"));

  File tsvFileY = new File(new File(tsvFolder, "Y"), "part-00000-00001");
  String writtenY = FileUtils.readFileToString(tsvFileY, Charset.forName("UTF-8"));
  assertThat(writtenY, is("2\tworld\n"));

  assertThat(new File(tsvFolder, Hadoop18TapUtil.TEMPORARY_PATH).exists(), is(false));
}
Example #10
Source File: TapDataWriterTest.java from plunger with Apache License 2.0

@Test
public void writeLocalPartition() throws IOException {
  File tsvFolder = temporaryFolder.newFolder("data");
  cascading.tap.local.PartitionTap partitionTap = new cascading.tap.local.PartitionTap(
      new cascading.tap.local.FileTap(new cascading.scheme.local.TextDelimited(valueFields),
          tsvFolder.getAbsolutePath()), new DelimitedPartition(partitionFields));

  Tap<?, ?, ?> returnedTap = new TapDataWriter(data).toTap(partitionTap);

  assertThat((cascading.tap.local.PartitionTap) returnedTap, is(partitionTap));

  File tsvFileX = new File(tsvFolder, "X");
  String writtenX = FileUtils.readFileToString(tsvFileX, Charset.forName("UTF-8"));
  assertThat(writtenX, is("1\thello\n"));

  File tsvFileY = new File(tsvFolder, "Y");
  String writtenY = FileUtils.readFileToString(tsvFileY, Charset.forName("UTF-8"));
  assertThat(writtenY, is("2\tworld\n"));
}
Example #11
Source File: FlinkFlowProcess.java from cascading-flink with Apache License 2.0

@Override
public TupleEntryCollector openTrapForWrite(Tap trap) throws IOException {

  if (trap instanceof Hfs) {

    JobConf jobConf = new JobConf(this.getConfigCopy());

    int stepNum = jobConf.getInt( "cascading.flow.step.num", 0 );
    int nodeNum = jobConf.getInt( "cascading.flow.node.num", 0 );

    String partname = String.format( "-%05d-%05d-%05d", stepNum, nodeNum, this.getCurrentSliceNum() );
    jobConf.set( "cascading.tapcollector.partname", "%s%spart" + partname );

    String value = String.format( "attempt_%012d_0000_m_%06d_0",
        (int) Math.rint( System.currentTimeMillis() ), this.getCurrentSliceNum() );
    jobConf.set( "mapred.task.id", value );
    jobConf.set( "mapreduce.task.id", value );

    return trap.openForWrite( new FlinkFlowProcess( jobConf ), null );
  }
  else {
    throw new UnsupportedOperationException("Only Hfs taps are supported as traps");
  }
}
Example #12
Source File: TapTypeUtil.java from plunger with Apache License 2.0

/** Determines the type of the configuration type argument of the supplied {@link Tap}. */
static Class<?> getTapConfigClass(Tap<?, ?, ?> tap) {
  Class<?> currentClass = tap.getClass();
  if (CompositeTap.class.isAssignableFrom(currentClass)) {
    currentClass = ((CompositeTap<?>) tap).getChildTaps().next().getClass();
  }
  while (currentClass != null) {
    if (Tap.class.isAssignableFrom(currentClass)) {
      Type genericSuperclass = currentClass.getGenericSuperclass();
      if (genericSuperclass instanceof ParameterizedType) {
        ParameterizedType tapType = (ParameterizedType) genericSuperclass;
        Type[] typeParameters = tapType.getActualTypeArguments();
        Type configTypeParameter = typeParameters[0];
        if (configTypeParameter instanceof Class) {
          Class<?> configClassParameter = (Class<?>) configTypeParameter;
          return configClassParameter;
        }
      }
    }
    currentClass = currentClass.getSuperclass();
  }
  return null;
}
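As a hedged illustration of what this resolution yields, consistent with the dispatch in Example #1 (the taps below are placeholders):

// Hypothetical probes: Hadoop taps resolve to Configuration, local taps to Properties.
Class<?> hadoopConf = TapTypeUtil.getTapConfigClass(
    new cascading.tap.hadoop.Hfs(new cascading.scheme.hadoop.TextDelimited(), "/tmp/in"));
// -> org.apache.hadoop.conf.Configuration.class

Class<?> localConf = TapTypeUtil.getTapConfigClass(
    new cascading.tap.local.FileTap(new cascading.scheme.local.TextDelimited(valueFields), "/tmp/in"));
// -> java.util.Properties.class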
Example #13
Source File: CommonCrawlIndexTest.java from aws-big-data-blog with Apache License 2.0

@Test
public void testCreateCommonCrawlFlowDef() throws Exception {
  Properties properties = new ConfigReader().renderProperties(CommonCrawlIndexTest.class);

  String sourcePath = properties.getProperty("inPath");
  String sinkPath = properties.getProperty("testCreateCommonCrawlFlowDefOutput");
  String sinkValidationPath = properties.getProperty("testCreateCommonCrawlFlowDefOutputValidation");

  // create the Cascading "source" (input) tap to read the commonCrawl WAT file(s)
  Tap source = new FileTap(new TextLine(new Fields("line")), sourcePath);

  // create the Cascading "sink" (output) tap to dump the results
  Tap sink = new FileTap(new TextLine(new Fields("line")), sinkPath);

  // build the Cascading flow definition
  FlowDef flowDef = CommonCrawlIndex.createCommonCrawlFlowDef(source, sink);

  new LocalFlowConnector(properties).connect(flowDef).complete();

  Assert.sameContent(sinkPath, sinkValidationPath);
}
Example #14
Source File: JDBCScheme.java from SpyGlass with Apache License 2.0

@Override
public void sinkConfInit( FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf ) {
  if( selectQuery != null )
    throw new TapException( "cannot sink to this Scheme" );

  String tableName = ( (JDBCTap) tap ).getTableName();
  int batchSize = ( (JDBCTap) tap ).getBatchSize();
  DBOutputFormat.setOutput( conf, DBOutputFormat.class, tableName, columns, updateBy, batchSize );

  if( outputFormatClass != null )
    conf.setOutputFormat( outputFormatClass );
}
Example #15
Source File: TapDataWriterTest.java from plunger with Apache License 2.0

@Test
public void writeHfs() throws IOException {
  File tsvFolder = temporaryFolder.newFolder("data");
  cascading.tap.hadoop.Hfs hfsTap = new cascading.tap.hadoop.Hfs(new cascading.scheme.hadoop.TextDelimited(),
      tsvFolder.getAbsolutePath());

  Tap<?, ?, ?> returnedTap = new TapDataWriter(data).toTap(hfsTap);

  assertThat((cascading.tap.hadoop.Hfs) returnedTap, is(hfsTap));

  String written = FileUtils.readFileToString(new File(tsvFolder, "part-00000"), Charset.forName("UTF-8"));
  assertThat(written, is("X\t1\thello\nY\t2\tworld\n"));
  assertThat(new File(tsvFolder, Hadoop18TapUtil.TEMPORARY_PATH).exists(), is(false));
}
Example #16
Source File: HBaseScheme.java from SpyGlass with Apache License 2.0

@Override
public void sinkConfInit(FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
  conf.setOutputFormat(HBaseOutputFormat.class);

  conf.setOutputKeyClass(ImmutableBytesWritable.class);
  conf.setOutputValueClass(Put.class);

  String tableName = conf.get(HBaseOutputFormat.OUTPUT_TABLE);
  useSalt = conf.getBoolean(String.format(HBaseConstants.USE_SALT, tableName), false);
}
Example #17
Source File: HBaseRawScheme.java from SpyGlass with Apache License 2.0

@Override
public void sourceConfInit(FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
  DeprecatedInputFormatWrapper.setInputFormat(org.apache.hadoop.hbase.mapreduce.TableInputFormat.class, conf,
      ValueCopier.class);
  if (null != familyNames) {
    String columns = Util.join(this.familyNames, " ");
    LOG.debug("sourcing from column families: {}", columns);
    conf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.SCAN_COLUMNS, columns);
  }
}
Example #18
Source File: TapDataReader.java from plunger with Apache License 2.0

private TupleEntryIterator getHadoopTupleEntryIterator() throws IOException {
  @SuppressWarnings("unchecked")
  Tap<JobConf, ?, ?> hadoopTap = (Tap<JobConf, ?, ?>) source;
  JobConf conf = new JobConf();
  FlowProcess<JobConf> flowProcess = new HadoopFlowProcess(conf);
  hadoopTap.sourceConfInit(flowProcess, conf);
  return hadoopTap.openForRead(flowProcess);
}
Example #19
Source File: JDBCTapCollector.java from SpyGlass with Apache License 2.0

/**
 * Constructor TapCollector creates a new TapCollector instance.
 *
 * @param flowProcess
 * @param tap of type Tap
 * @throws IOException when fails to initialize
 */
public JDBCTapCollector( FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap ) throws IOException {
  super( flowProcess, tap.getScheme() );
  this.hadoopFlowProcess = flowProcess;
  this.tap = tap;
  this.conf = new JobConf( flowProcess.getConfigCopy() );
  this.setOutput( this );
}
Example #20
Source File: TapDataReader.java from plunger with Apache License 2.0

private TupleEntryIterator getLocalTupleEntryIterator() throws IOException {
  @SuppressWarnings("unchecked")
  Tap<Properties, ?, ?> localTap = (Tap<Properties, ?, ?>) source;
  Properties properties = new Properties();
  FlowProcess<Properties> flowProcess = new LocalFlowProcess(properties);
  localTap.sourceConfInit(flowProcess, properties);
  return localTap.openForRead(flowProcess);
}
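Examples #18 and #20 are the two halves of a read-side dispatch that mirrors TapDataWriter.toTap in Example #1. A hypothetical sketch of the surrounding selection logic, assuming the same TapTypeUtil helper shown in Example #12:

// Hypothetical dispatcher: choose the Hadoop or local reader from the tap's config type.
private TupleEntryIterator openForRead(Tap<?, ?, ?> source) throws IOException {
  Class<?> configClass = TapTypeUtil.getTapConfigClass(source);
  if (Configuration.class.equals(configClass)) {
    return getHadoopTupleEntryIterator();
  } else if (Properties.class.equals(configClass)) {
    return getLocalTupleEntryIterator();
  }
  throw new IllegalArgumentException("Unsupported tap type: " + source.getClass());
}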
Example #21
Source File: TapDataWriter.java from plunger with Apache License 2.0

private void writeToHadoopTap(Tap<?, ?, ?> tap) throws IOException {
  @SuppressWarnings("unchecked")
  Tap<JobConf, ?, ?> hadoopTap = (Tap<JobConf, ?, ?>) tap;
  JobConf conf = new JobConf();
  HadoopFlowProcess flowProcess = new HadoopFlowProcess(conf);
  hadoopTap.sinkConfInit(flowProcess, conf);
  TupleEntryCollector collector = hadoopTap.openForWrite(flowProcess);
  for (TupleEntry tuple : data.asTupleEntryList()) {
    collector.add(tuple);
  }
  collector.close();
}
Example #22
Source File: ParquetTupleScheme.java from parquet-mr with Apache License 2.0

@SuppressWarnings("rawtypes")
@Override
public void sinkConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema);
  ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class);
}
Example #23
Source File: ParquetTupleScheme.java from parquet-mr with Apache License 2.0

@Override
public Fields retrieveSourceFields(FlowProcess<? extends JobConf> flowProcess, Tap tap) {
  MessageType schema = readSchema(flowProcess, tap);
  SchemaIntersection intersection = new SchemaIntersection(schema, getSourceFields());

  setSourceFields(intersection.getSourceFields());

  return getSourceFields();
}
Example #24
Source File: WordCount.java from cascading-flink with Apache License 2.0

public static void main(String[] args) {

  if (args.length < 2) {
    throw new IllegalArgumentException("Please specify input and output paths as arguments.");
  }

  Fields token = new Fields( "token", String.class );
  Fields text = new Fields( "text" );
  RegexSplitGenerator splitter = new RegexSplitGenerator( token, "\\s+" );

  // only returns "token"
  Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

  Pipe wcPipe = new Pipe( "wc", docPipe );
  wcPipe = new AggregateBy( wcPipe, token, new CountBy(new Fields("count")));

  Tap inTap = new Hfs(new TextDelimited(text, "\n" ), args[0]);
  Tap outTap = new Hfs(new TextDelimited(false, "\n"), args[1], SinkMode.REPLACE);

  FlowDef flowDef = FlowDef.flowDef().setName( "wc" )
      .addSource( docPipe, inTap )
      .addTailSink( wcPipe, outTap );

  FlowConnector flowConnector = new FlinkConnector();
  Flow wcFlow = flowConnector.connect( flowDef );
  wcFlow.complete();
}
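Note that the pipe assembly and taps above are plain Cascading; the only Flink-specific line is new FlinkConnector(). In principle the same FlowDef could be handed to another planner (for example Cascading's Hadoop2MR1FlowConnector) without touching the taps or pipes — that drop-in property is the point of the cascading-flink connector.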
Example #25
Source File: ParquetValueScheme.java from parquet-mr with Apache License 2.0

@Override
public void sourceConfInit(FlowProcess<? extends JobConf> jobConfFlowProcess,
    Tap<JobConf, RecordReader, OutputCollector> jobConfRecordReaderOutputCollectorTap, JobConf jobConf) {
  setPredicatePushdown(jobConf);
  setProjectionPushdown(jobConf);
  setStrictProjectionPushdown(jobConf);
  setRecordClass(jobConf);
}
Example #26
Source File: ParquetTBaseScheme.java from parquet-mr with Apache License 2.0

@Override
public void sinkConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {

  if (this.config.getKlass() == null) {
    throw new IllegalArgumentException("To use ParquetTBaseScheme as a sink, you must specify a thrift class in the constructor");
  }

  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, TBaseWriteSupport.class);
  TBaseWriteSupport.<T>setThriftClass(jobConf, this.config.getKlass());
}
Example #27
Source File: ParquetTBaseScheme.java from parquet-mr with Apache License 2.0

@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}
Example #28
Source File: ParquetScroogeScheme.java from parquet-mr with Apache License 2.0

@Override
public void sourceConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ScroogeReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, ScroogeRecordConverter.class);
}
Example #29
Source File: ParquetScroogeScheme.java from parquet-mr with Apache License 2.0

@Override
public void sinkConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  ParquetOutputFormat.setWriteSupportClass(jobConf, ScroogeWriteSupport.class);
  ScroogeWriteSupport.setScroogeClass(jobConf, this.config.getKlass());
}
Example #30
Source File: ParquetTupleScheme.java from parquet-mr with Apache License 2.0

@SuppressWarnings("rawtypes")
@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  if (filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate);
  }
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class);
  TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
}