Java Code Examples for org.apache.tez.dag.api.Vertex#addDataSink()
The following examples show how to use org.apache.tez.dag.api.Vertex#addDataSink().
Each example is taken from an open-source project; the source file and license are noted above the code.
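Before the project examples, here is a minimal sketch of the pattern they share: build a DataSinkDescriptor (here via MROutput's config builder) and attach it to a vertex with addDataSink() before adding the vertex to the DAG. The class, vertex, output, path, and processor names below are illustrative placeholders, not taken from any of the projects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.output.MROutput;

public class AddDataSinkSketch {

  // Builds a one-vertex DAG whose output is written through MROutput to outputPath.
  static DAG buildDag(TezConfiguration tezConf, String outputPath) throws IOException {
    // Describe the sink: MROutput's config builder produces a DataSinkDescriptor.
    DataSinkDescriptor sink = MROutput.createConfigBuilder(
        new Configuration(tezConf), TextOutputFormat.class, outputPath).build();

    // "com.example.ExampleProcessor" is a placeholder processor class name.
    Vertex vertex = Vertex.create("exampleVertex",
        ProcessorDescriptor.create("com.example.ExampleProcessor"), 1);

    // Attach the sink under a logical name; the processor writes to it by this name.
    vertex.addDataSink("exampleOutput", sink);

    return DAG.create("ExampleDAG").addVertex(vertex);
  }
}

The examples below apply the same pattern with project-specific processors, multiple sinks on one vertex (Example 3), sinks on a VertexGroup (Examples 2 and 7), and hand-built DataSinkDescriptors with custom output committers (Examples 2, 4, 8, and 9).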
Example 1
Source File: TopKDataGen.java, from sequenceiq-samples, Apache License 2.0
private DAG createDag(TezConfiguration tezConf, Path outPath, long outSize,
    int extraColumns, int numTasks) throws IOException {
  long largeOutSizePerTask = outSize / numTasks;
  DAG dag = DAG.create("TopK DataGen");
  Vertex genDataVertex = Vertex.create("datagen", ProcessorDescriptor.create(
      GenDataProcessor.class.getName()).setUserPayload(
      UserPayload.create(ByteBuffer.wrap(
          GenDataProcessor.createConfiguration(largeOutSizePerTask, extraColumns)))),
      numTasks);
  genDataVertex.addDataSink(OUTPUT,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
          outPath.toUri().toString()).build());
  dag.addVertex(genDataVertex);
  return dag;
}
Example 2
Source File: TestMockDAGAppMaster.java, from tez, Apache License 2.0
private DAG createDAG(String dagName, boolean uv12CommitFail, boolean v3CommitFail) {
  DAG dag = DAG.create(dagName);
  Vertex v1 = Vertex.create("v1", ProcessorDescriptor.create("Proc"), 1);
  Vertex v2 = Vertex.create("v2", ProcessorDescriptor.create("Proc"), 1);
  Vertex v3 = Vertex.create("v3", ProcessorDescriptor.create("Proc"), 1);
  VertexGroup uv12 = dag.createVertexGroup("uv12", v1, v2);
  DataSinkDescriptor uv12DataSink = DataSinkDescriptor.create(
      OutputDescriptor.create("dummy output"), createOutputCommitterDesc(uv12CommitFail), null);
  uv12.addDataSink("uv12Out", uv12DataSink);
  DataSinkDescriptor v3DataSink = DataSinkDescriptor.create(
      OutputDescriptor.create("dummy output"), createOutputCommitterDesc(v3CommitFail), null);
  v3.addDataSink("v3Out", v3DataSink);
  GroupInputEdge e1 = GroupInputEdge.create(uv12, v3,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER,
          DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create("dummy output class"),
          InputDescriptor.create("dummy input class")),
      InputDescriptor.create("merge.class"));
  dag.addVertex(v1)
      .addVertex(v2)
      .addVertex(v3)
      .addEdge(e1);
  return dag;
}
Example 3
Source File: JoinDataGen.java, from tez, Apache License 2.0
private DAG createDag(TezConfiguration tezConf, Path largeOutPath, Path smallOutPath,
    Path expectedOutputPath, int numTasks, long largeOutSize, long smallOutSize)
    throws IOException {
  long largeOutSizePerTask = largeOutSize / numTasks;
  long smallOutSizePerTask = smallOutSize / numTasks;
  DAG dag = DAG.create("JoinDataGen");
  Vertex genDataVertex = Vertex.create("datagen", ProcessorDescriptor.create(
      GenDataProcessor.class.getName()).setUserPayload(
      UserPayload.create(ByteBuffer.wrap(
          GenDataProcessor.createConfiguration(largeOutSizePerTask, smallOutSizePerTask)))),
      numTasks);
  genDataVertex.addDataSink(STREAM_OUTPUT_NAME,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
          largeOutPath.toUri().toString()).build());
  genDataVertex.addDataSink(HASH_OUTPUT_NAME,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
          smallOutPath.toUri().toString()).build());
  genDataVertex.addDataSink(EXPECTED_OUTPUT_NAME,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class,
          expectedOutputPath.toUri().toString()).build());
  dag.addVertex(genDataVertex);
  return dag;
}
Example 4
Source File: CartesianProduct.java, from tez, Apache License 2.0
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  InputDescriptor inputDescriptor = InputDescriptor.create(FakeInput.class.getName());
  InputInitializerDescriptor inputInitializerDescriptor =
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName());
  DataSourceDescriptor dataSourceDescriptor =
      DataSourceDescriptor.create(inputDescriptor, inputInitializerDescriptor, null);

  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT, dataSourceDescriptor);
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT, dataSourceDescriptor);

  OutputDescriptor outputDescriptor = OutputDescriptor.create(FakeOutput.class.getName());
  OutputCommitterDescriptor outputCommitterDescriptor =
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName());
  DataSinkDescriptor dataSinkDescriptor =
      DataSinkDescriptor.create(outputDescriptor, outputCommitterDescriptor, null);

  CartesianProductConfig cartesianProductConfig =
      new CartesianProductConfig(Arrays.asList(sourceVertices));
  UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);

  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v3.addDataSink(OUTPUT, dataSinkDescriptor);
  v3.setVertexManagerPlugin(
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
          .setUserPayload(userPayload));

  EdgeManagerPluginDescriptor edgeManagerDescriptor =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManagerDescriptor.setUserPayload(userPayload);
  UnorderedPartitionedKVEdgeConfig edgeConf =
      UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(),
          IntWritable.class.getName(), RoundRobinPartitioner.class.getName()).build();
  EdgeProperty edgeProperty = edgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);

  return DAG.create("CrossProduct").addVertex(v1).addVertex(v2).addVertex(v3)
      .addEdge(Edge.create(v1, v3, edgeProperty)).addEdge(Edge.create(v2, v3, edgeProperty));
}
Example 5
Source File: CartesianProduct.java, from tez, Apache License 2.0
private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2,
    String inputPath3, String outputPath, boolean isPartitioned) throws IOException {
  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  // turn off groupSplit so that each input file incurs one task
  v1.addDataSource(INPUT,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1)
          .groupSplits(false).build());
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2)
          .groupSplits(false).build());
  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v3.addDataSource(INPUT,
      MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3)
          .groupSplits(false).build());

  CartesianProductConfig cartesianProductConfig;
  if (isPartitioned) {
    Map<String, Integer> vertexPartitionMap = new HashMap<>();
    for (String vertex : cpSources) {
      vertexPartitionMap.put(vertex, numPartition);
    }
    cartesianProductConfig = new CartesianProductConfig(vertexPartitionMap);
  } else {
    cartesianProductConfig = new CartesianProductConfig(Arrays.asList(cpSources));
  }
  UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);

  Vertex v4 = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v4.addDataSink(OUTPUT,
      MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
          .build());
  v4.setVertexManagerPlugin(
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
          .setUserPayload(userPayload));

  EdgeManagerPluginDescriptor cpEdgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  cpEdgeManager.setUserPayload(userPayload);
  EdgeProperty cpEdgeProperty;
  if (isPartitioned) {
    UnorderedPartitionedKVEdgeConfig cpEdgeConf =
        UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(),
            IntWritable.class.getName(), CustomPartitioner.class.getName()).build();
    cpEdgeProperty = cpEdgeConf.createDefaultCustomEdgeProperty(cpEdgeManager);
  } else {
    UnorderedKVEdgeConfig edgeConf =
        UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build();
    cpEdgeProperty = edgeConf.createDefaultCustomEdgeProperty(cpEdgeManager);
  }

  EdgeProperty broadcastEdgeProperty;
  UnorderedKVEdgeConfig broadcastEdgeConf =
      UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build();
  broadcastEdgeProperty = broadcastEdgeConf.createDefaultBroadcastEdgeProperty();

  return DAG.create("CartesianProduct")
      .addVertex(v1).addVertex(v2).addVertex(v3).addVertex(v4)
      .addEdge(Edge.create(v1, v4, cpEdgeProperty))
      .addEdge(Edge.create(v2, v4, cpEdgeProperty))
      .addEdge(Edge.create(v3, v4, broadcastEdgeProperty));
}
Example 6
Source File: OrderedWordCount.java, from tez, Apache License 2.0
public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient,
    String dagName) throws IOException {
  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf),
      TextInputFormat.class, inputPath).groupSplits(!disableSplitGrouping)
      .generateSplitsInAM(!isGenerateSplitInClient).build();

  DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
      TextOutputFormat.class, outputPath).build();

  Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
      TokenProcessor.class.getName()));
  tokenizerVertex.addDataSource(INPUT, dataSource);

  // Use Text key and IntWritable value to bring counts for each word in the same partition
  // The setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // This vertex will be reading intermediate data via an input edge and writing intermediate data
  // via an output edge.
  Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(
      SumProcessor.class.getName()), numPartitions);

  // Use IntWritable key and Text value to bring all words with the same count in the same
  // partition. The data will be ordered by count and words grouped by count. The
  // setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(IntWritable.class.getName(), Text.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Use 1 task to bring all the data in one place for global sorted order. Essentially the number
  // of partitions is 1. So the NoOpSorter can be used to produce the globally ordered output
  Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create(
      NoOpSorter.class.getName()), 1);
  sorterVertex.addDataSink(OUTPUT, dataSink);

  // No need to add jar containing this class as assumed to be part of the tez jars.

  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex)
      .addVertex(summationVertex)
      .addVertex(sorterVertex)
      .addEdge(
          Edge.create(tokenizerVertex, summationVertex,
              summationEdgeConf.createDefaultEdgeProperty()))
      .addEdge(
          Edge.create(summationVertex, sorterVertex, sorterEdgeConf.createDefaultEdgeProperty()));
  return dag;
}
Example 7
Source File: UnionExample.java, from tez, Apache License 2.0
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");

  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

  Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(
      UnionProcessor.class.getName()), 1);

  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);

  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", od2);

  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);

  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();

  dag.addVertex(mapVertex1)
      .addVertex(mapVertex2)
      .addVertex(mapVertex3)
      .addVertex(checkerVertex)
      .addEdge(
          Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(
          GroupInputEdge.create(unionVertex, checkerVertex,
              edgeConf.createDefaultEdgeProperty(),
              InputDescriptor.create(
                  ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}
Example 8
Source File: YARNRunner.java, from tez, Apache License 2.0
private Vertex createVertexForStage(Configuration stageConf,
    Map<String, LocalResource> jobLocalResources,
    List<TaskLocationHint> locations, int stageNum, int totalStages)
    throws IOException {
  // stageNum starts from 0, goes till numStages - 1
  boolean isMap = false;
  if (stageNum == 0) {
    isMap = true;
  }

  int numTasks = isMap ? stageConf.getInt(MRJobConfig.NUM_MAPS, 0)
      : stageConf.getInt(MRJobConfig.NUM_REDUCES, 0);
  String processorName = isMap ? MapProcessor.class.getName()
      : ReduceProcessor.class.getName();

  String vertexName = null;
  if (isMap) {
    vertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
  } else {
    if (stageNum == totalStages - 1) {
      vertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
    } else {
      vertexName = MultiStageMRConfigUtil
          .getIntermediateStageVertexName(stageNum);
    }
  }

  Resource taskResource = isMap ? MRHelpers.getResourceForMRMapper(stageConf)
      : MRHelpers.getResourceForMRReducer(stageConf);

  stageConf.set(MRJobConfig.MROUTPUT_FILE_NAME_PREFIX, "part");

  UserPayload vertexUserPayload = TezUtils.createUserPayloadFromConf(stageConf);
  Vertex vertex = Vertex.create(vertexName,
      ProcessorDescriptor.create(processorName).setUserPayload(vertexUserPayload),
      numTasks, taskResource);
  if (stageConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT,
      TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT)) {
    vertex.getProcessorDescriptor().setHistoryText(TezUtils.convertToHistoryText(stageConf));
  }

  if (isMap) {
    vertex.addDataSource("MRInput",
        configureMRInputWithLegacySplitsGenerated(stageConf, true));
  }
  // Map only jobs.
  if (stageNum == totalStages - 1) {
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
        .setUserPayload(vertexUserPayload);
    if (stageConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT,
        TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT)) {
      od.setHistoryText(TezUtils.convertToHistoryText(stageConf));
    }
    vertex.addDataSink("MROutput", DataSinkDescriptor.create(od,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null));
  }

  Map<String, String> taskEnv = new HashMap<String, String>();
  setupMapReduceEnv(stageConf, taskEnv, isMap);

  Map<String, LocalResource> taskLocalResources =
      new TreeMap<String, LocalResource>();
  // PRECOMMIT Remove split localization for reduce tasks if it's being set
  // here
  taskLocalResources.putAll(jobLocalResources);

  String taskJavaOpts = isMap ? MRHelpers.getJavaOptsForMRMapper(stageConf)
      : MRHelpers.getJavaOptsForMRReducer(stageConf);

  vertex.setTaskEnvironment(taskEnv)
      .addTaskLocalFiles(taskLocalResources)
      .setLocationHint(VertexLocationHint.create(locations))
      .setTaskLaunchCmdOpts(taskJavaOpts);
  if (!isMap) {
    vertex.setVertexManagerPlugin((ShuffleVertexManager.createConfigBuilder(stageConf).build()));
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("Adding vertex to DAG" + ", vertexName=" + vertex.getName()
        + ", processor=" + vertex.getProcessorDescriptor().getClassName()
        + ", parallelism=" + vertex.getParallelism()
        + ", javaOpts=" + vertex.getTaskLaunchCmdOpts()
        + ", resources=" + vertex.getTaskResource()
        // TODO Add localResources and Environment
    );
  }

  return vertex;
}
Example 9
Source File: TestMockDAGAppMaster.java, from tez, Apache License 2.0
@Test (timeout = 10000)
public void testBasicStatistics() throws Exception {
  TezConfiguration tezconf = new TezConfiguration(defaultConf);
  MockTezClient tezClient = new MockTezClient("testMockAM", tezconf, true, null, null, null,
      null, false, false);
  tezClient.start();

  final String vAName = "A";
  final String vBName = "B";
  final String sourceName = "In";
  final String sinkName = "Out";
  DAG dag = DAG.create("testBasisStatistics");
  Vertex vA = Vertex.create(vAName, ProcessorDescriptor.create("Proc.class"), 3);
  Vertex vB = Vertex.create(vBName, ProcessorDescriptor.create("Proc.class"), 2);
  vA.addDataSource(sourceName,
      DataSourceDescriptor.create(InputDescriptor.create("In"), null, null));
  vB.addDataSink(sinkName, DataSinkDescriptor.create(OutputDescriptor.create("Out"), null, null));
  dag.addVertex(vA)
      .addVertex(vB)
      .addEdge(
          Edge.create(vA, vB,
              EdgeProperty.create(DataMovementType.SCATTER_GATHER,
                  DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
                  OutputDescriptor.create("Out"), InputDescriptor.create("In"))));

  IOStatistics ioStats = new IOStatistics();
  ioStats.setDataSize(1);
  ioStats.setItemsProcessed(1);
  TaskStatistics vAStats = new TaskStatistics();
  vAStats.addIO(vBName, ioStats);
  vAStats.addIO(sourceName, ioStats);
  TaskStatistics vBStats = new TaskStatistics();
  vBStats.addIO(vAName, ioStats);
  vBStats.addIO(sinkName, ioStats);

  ByteArrayOutputStream bosA = new ByteArrayOutputStream();
  DataOutput outA = new DataOutputStream(bosA);
  vAStats.write(outA);
  final byte[] payloadA = bosA.toByteArray();
  ByteArrayOutputStream bosB = new ByteArrayOutputStream();
  DataOutput outB = new DataOutputStream(bosB);
  vBStats.write(outB);
  final byte[] payloadB = bosB.toByteArray();

  MockDAGAppMaster mockApp = tezClient.getLocalClient().getMockApp();
  MockContainerLauncher mockLauncher = mockApp.getContainerLauncher();
  mockLauncher.startScheduling(false);
  mockApp.statsDelegate = new StatisticsDelegate() {
    @Override
    public TaskStatistics getStatistics(TaskSpec taskSpec) {
      byte[] payload = payloadA;
      TaskStatistics stats = new TaskStatistics();
      if (taskSpec.getVertexName().equals(vBName)) {
        payload = payloadB;
      }
      final DataInputByteBuffer in = new DataInputByteBuffer();
      in.reset(ByteBuffer.wrap(payload));
      try {
        // this ensures that the serde code path is covered.
        stats.readFields(in);
      } catch (IOException e) {
        Assert.fail(e.getMessage());
      }
      return stats;
    }
  };
  mockApp.doSleep = false;
  DAGClient dagClient = tezClient.submitDAG(dag);
  mockLauncher.waitTillContainersLaunched();
  DAGImpl dagImpl = (DAGImpl) mockApp.getContext().getCurrentDAG();
  mockLauncher.startScheduling(true);
  DAGStatus status = dagClient.waitForCompletion();
  Assert.assertEquals(DAGStatus.State.SUCCEEDED, status.getState());

  // verify that the values have been aggregated correctly
  for (org.apache.tez.dag.app.dag.Vertex v : dagImpl.getVertices().values()) {
    VertexStatistics vStats = v.getStatistics();
    if (v.getName().equals(vAName)) {
      Assert.assertEquals(3, vStats.getOutputStatistics(vBName).getDataSize());
      Assert.assertEquals(3, vStats.getInputStatistics(sourceName).getDataSize());
      Assert.assertEquals(3, vStats.getOutputStatistics(vBName).getItemsProcessed());
      Assert.assertEquals(3, vStats.getInputStatistics(sourceName).getItemsProcessed());
    } else {
      Assert.assertEquals(2, vStats.getInputStatistics(vAName).getDataSize());
      Assert.assertEquals(2, vStats.getOutputStatistics(sinkName).getDataSize());
      Assert.assertEquals(2, vStats.getInputStatistics(vAName).getItemsProcessed());
      Assert.assertEquals(2, vStats.getOutputStatistics(sinkName).getItemsProcessed());
    }
  }
  tezClient.stop();
}