Java Code Examples for org.apache.tez.dag.api.Vertex#addDataSource()
The following examples show how to use
org.apache.tez.dag.api.Vertex#addDataSource().
You can vote up the examples you find useful or vote down those you don't,
and follow the links above each example to visit the original project or source file. You may also check out the related API usage on the sidebar.
Example 1
Source File: CartesianProduct.java From tez with Apache License 2.0 | 5 votes |
/**
 * Builds a cross-product DAG: two token-producing source vertices (v1, v2) feed a
 * join vertex (v3) through cartesian-product custom edges.
 *
 * @param tezConf configuration used to serialize the cartesian-product payload
 * @return the assembled DAG named "CrossProduct"
 * @throws IOException if payload serialization fails
 */
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  // Fake input descriptor + initializer, shared by both source vertices.
  InputDescriptor inDesc = InputDescriptor.create(FakeInput.class.getName());
  InputInitializerDescriptor initDesc =
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName());
  DataSourceDescriptor source = DataSourceDescriptor.create(inDesc, initDesc, null);

  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT, source);
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT, source);

  // Fake sink for the join vertex.
  OutputDescriptor outDesc = OutputDescriptor.create(FakeOutput.class.getName());
  OutputCommitterDescriptor committerDesc =
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName());
  DataSinkDescriptor sink = DataSinkDescriptor.create(outDesc, committerDesc, null);

  // The cartesian-product config is serialized into a payload shared by the
  // vertex manager and the edge managers.
  CartesianProductConfig cpConfig = new CartesianProductConfig(Arrays.asList(sourceVertices));
  UserPayload payload = cpConfig.toUserPayload(tezConf);

  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v3.addDataSink(OUTPUT, sink);
  v3.setVertexManagerPlugin(
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
          .setUserPayload(payload));

  EdgeManagerPluginDescriptor edgeManager =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManager.setUserPayload(payload);
  UnorderedPartitionedKVEdgeConfig edgeConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          RoundRobinPartitioner.class.getName())
      .build();
  EdgeProperty edgeProperty = edgeConf.createDefaultCustomEdgeProperty(edgeManager);

  return DAG.create("CrossProduct").addVertex(v1).addVertex(v2).addVertex(v3)
      .addEdge(Edge.create(v1, v3, edgeProperty))
      .addEdge(Edge.create(v2, v3, edgeProperty));
}
Example 2
Source File: TestExceptionPropagation.java From tez with Apache License 2.0 | 5 votes |
/** * create a DAG with 2 vertices (v1 --> v2), set payload on Input/Output/Processor/VertexManagerPlugin to * control where throw exception * * @param exLocation * @return * @throws IOException */ private DAG createDAG(ExceptionLocation exLocation) throws IOException { DAG dag = DAG.create("dag_" + exLocation.name()); UserPayload payload = UserPayload.create(ByteBuffer.wrap(exLocation.name().getBytes())); Vertex v1 = Vertex.create("v1", ProcessorWithException.getProcDesc(payload), 1); InputDescriptor inputDesc = InputWithException.getInputDesc(payload); InputInitializerDescriptor iiDesc = InputInitializerWithException.getIIDesc(payload); v1.addDataSource("input", DataSourceDescriptor.create(inputDesc, iiDesc, null)); v1.setVertexManagerPlugin(RootInputVertexManagerWithException .getVMDesc(exLocation)); Vertex v2 = Vertex.create("v2", DoNothingProcessor.getProcDesc(), 1); v2.addDataSource("input2", DataSourceDescriptor.create(InputDescriptor.create(NoOpInput.class.getName()), InputInitializerWithException2.getIIDesc(payload), null)); dag.addVertex(v1) .addVertex(v2); if (exLocation.name().startsWith("EM_")) { dag.addEdge(Edge.create(v1, v2, EdgeProperty.create( EdgeManagerPluginDescriptor.create(CustomEdgeManager.class.getName()) .setUserPayload(payload), DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputWithException.getOutputDesc(payload), InputWithException.getInputDesc(payload)))); } else { // set Customized VertexManager here, it can't been used for CustomEdge v2.setVertexManagerPlugin(InputReadyVertexManagerWithException.getVMDesc(exLocation)); dag.addEdge(Edge.create(v1, v2, EdgeProperty.create(DataMovementType.ONE_TO_ONE, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputWithException.getOutputDesc(payload), InputWithException.getInputDesc(payload)))); } return dag; }
Example 3
Source File: TestMemoryWithEvents.java From tez with Apache License 2.0 | 5 votes |
@Ignore @Test (timeout = 600000) public void testMemoryRootInputEvents() throws Exception { DAG dag = DAG.create("testMemoryRootInputEvents"); Vertex vA = Vertex.create("A", ProcessorDescriptor.create("Proc.class"), numTasks); Vertex vB = Vertex.create("B", ProcessorDescriptor.create("Proc.class"), numTasks); vA.addDataSource( "Input", DataSourceDescriptor.create(InputDescriptor.create("In"), InputInitializerDescriptor.create(SimulationInitializer.class.getName()), null)); dag.addVertex(vA).addVertex(vB); testMemory(dag, false); }
Example 4
Source File: CartesianProduct.java From tez with Apache License 2.0 | 4 votes |
private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2, String inputPath3, String outputPath, boolean isPartitioned) throws IOException { Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName())); // turn off groupSplit so that each input file incurs one task v1.addDataSource(INPUT, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1) .groupSplits(false).build()); Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName())); v2.addDataSource(INPUT, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2) .groupSplits(false).build()); Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName())); v3.addDataSource(INPUT, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3) .groupSplits(false).build()); CartesianProductConfig cartesianProductConfig; if (isPartitioned) { Map<String, Integer> vertexPartitionMap = new HashMap<>(); for (String vertex : cpSources) { vertexPartitionMap.put(vertex, numPartition); } cartesianProductConfig = new CartesianProductConfig(vertexPartitionMap); } else { cartesianProductConfig = new CartesianProductConfig(Arrays.asList(cpSources)); } UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf); Vertex v4 = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName())); v4.addDataSink(OUTPUT, MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath) .build()); v4.setVertexManagerPlugin( VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()) .setUserPayload(userPayload)); EdgeManagerPluginDescriptor cpEdgeManager = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName()); cpEdgeManager.setUserPayload(userPayload); EdgeProperty cpEdgeProperty; if (isPartitioned) { 
UnorderedPartitionedKVEdgeConfig cpEdgeConf = UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), CustomPartitioner.class.getName()).build(); cpEdgeProperty = cpEdgeConf.createDefaultCustomEdgeProperty(cpEdgeManager); } else { UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build(); cpEdgeProperty = edgeConf.createDefaultCustomEdgeProperty(cpEdgeManager); } EdgeProperty broadcastEdgeProperty; UnorderedKVEdgeConfig broadcastEdgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build(); broadcastEdgeProperty = broadcastEdgeConf.createDefaultBroadcastEdgeProperty(); return DAG.create("CartesianProduct") .addVertex(v1).addVertex(v2).addVertex(v3).addVertex(v4) .addEdge(Edge.create(v1, v4, cpEdgeProperty)) .addEdge(Edge.create(v2, v4, cpEdgeProperty)) .addEdge(Edge.create(v3, v4, broadcastEdgeProperty)); }
Example 5
Source File: OrderedWordCount.java From tez with Apache License 2.0 | 4 votes |
public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath, int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient, String dagName) throws IOException { DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath).groupSplits(!disableSplitGrouping) .generateSplitsInAM(!isGenerateSplitInClient).build(); DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath).build(); Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create( TokenProcessor.class.getName())); tokenizerVertex.addDataSource(INPUT, dataSource); // Use Text key and IntWritable value to bring counts for each word in the same partition // The setFromConfiguration call is optional and allows overriding the config options with // command line parameters. OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()) .setFromConfiguration(tezConf) .build(); // This vertex will be reading intermediate data via an input edge and writing intermediate data // via an output edge. Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create( SumProcessor.class.getName()), numPartitions); // Use IntWritable key and Text value to bring all words with the same count in the same // partition. The data will be ordered by count and words grouped by count. The // setFromConfiguration call is optional and allows overriding the config options with // command line parameters. OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig .newBuilder(IntWritable.class.getName(), Text.class.getName(), HashPartitioner.class.getName()) .setFromConfiguration(tezConf) .build(); // Use 1 task to bring all the data in one place for global sorted order. 
Essentially the number // of partitions is 1. So the NoOpSorter can be used to produce the globally ordered output Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create( NoOpSorter.class.getName()), 1); sorterVertex.addDataSink(OUTPUT, dataSink); // No need to add jar containing this class as assumed to be part of the tez jars. DAG dag = DAG.create(dagName); dag.addVertex(tokenizerVertex) .addVertex(summationVertex) .addVertex(sorterVertex) .addEdge( Edge.create(tokenizerVertex, summationVertex, summationEdgeConf.createDefaultEdgeProperty())) .addEdge( Edge.create(summationVertex, sorterVertex, sorterEdgeConf.createDefaultEdgeProperty())); return dag; }
Example 6
Source File: YARNRunner.java From tez with Apache License 2.0 | 4 votes |
private Vertex createVertexForStage(Configuration stageConf, Map<String, LocalResource> jobLocalResources, List<TaskLocationHint> locations, int stageNum, int totalStages) throws IOException { // stageNum starts from 0, goes till numStages - 1 boolean isMap = false; if (stageNum == 0) { isMap = true; } int numTasks = isMap ? stageConf.getInt(MRJobConfig.NUM_MAPS, 0) : stageConf.getInt(MRJobConfig.NUM_REDUCES, 0); String processorName = isMap ? MapProcessor.class.getName() : ReduceProcessor.class.getName(); String vertexName = null; if (isMap) { vertexName = MultiStageMRConfigUtil.getInitialMapVertexName(); } else { if (stageNum == totalStages - 1) { vertexName = MultiStageMRConfigUtil.getFinalReduceVertexName(); } else { vertexName = MultiStageMRConfigUtil .getIntermediateStageVertexName(stageNum); } } Resource taskResource = isMap ? MRHelpers.getResourceForMRMapper(stageConf) : MRHelpers.getResourceForMRReducer(stageConf); stageConf.set(MRJobConfig.MROUTPUT_FILE_NAME_PREFIX, "part"); UserPayload vertexUserPayload = TezUtils.createUserPayloadFromConf(stageConf); Vertex vertex = Vertex.create(vertexName, ProcessorDescriptor.create(processorName).setUserPayload(vertexUserPayload), numTasks, taskResource); if (stageConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT)) { vertex.getProcessorDescriptor().setHistoryText(TezUtils.convertToHistoryText(stageConf)); } if (isMap) { vertex.addDataSource("MRInput", configureMRInputWithLegacySplitsGenerated(stageConf, true)); } // Map only jobs. 
if (stageNum == totalStages -1) { OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName()) .setUserPayload(vertexUserPayload); if (stageConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT)) { od.setHistoryText(TezUtils.convertToHistoryText(stageConf)); } vertex.addDataSink("MROutput", DataSinkDescriptor.create(od, OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null)); } Map<String, String> taskEnv = new HashMap<String, String>(); setupMapReduceEnv(stageConf, taskEnv, isMap); Map<String, LocalResource> taskLocalResources = new TreeMap<String, LocalResource>(); // PRECOMMIT Remove split localization for reduce tasks if it's being set // here taskLocalResources.putAll(jobLocalResources); String taskJavaOpts = isMap ? MRHelpers.getJavaOptsForMRMapper(stageConf) : MRHelpers.getJavaOptsForMRReducer(stageConf); vertex.setTaskEnvironment(taskEnv) .addTaskLocalFiles(taskLocalResources) .setLocationHint(VertexLocationHint.create(locations)) .setTaskLaunchCmdOpts(taskJavaOpts); if (!isMap) { vertex.setVertexManagerPlugin((ShuffleVertexManager.createConfigBuilder(stageConf).build())); } if (LOG.isDebugEnabled()) { LOG.debug("Adding vertex to DAG" + ", vertexName=" + vertex.getName() + ", processor=" + vertex.getProcessorDescriptor().getClassName() + ", parallelism=" + vertex.getParallelism() + ", javaOpts=" + vertex.getTaskLaunchCmdOpts() + ", resources=" + vertex.getTaskResource() // TODO Add localResources and Environment ); } return vertex; }
Example 7
Source File: TestMockDAGAppMaster.java From tez with Apache License 2.0 | 4 votes |
@Test (timeout = 10000) public void testBasicStatistics() throws Exception { TezConfiguration tezconf = new TezConfiguration(defaultConf); MockTezClient tezClient = new MockTezClient("testMockAM", tezconf, true, null, null, null, null, false, false); tezClient.start(); final String vAName = "A"; final String vBName = "B"; final String sourceName = "In"; final String sinkName = "Out"; DAG dag = DAG.create("testBasisStatistics"); Vertex vA = Vertex.create(vAName, ProcessorDescriptor.create("Proc.class"), 3); Vertex vB = Vertex.create(vBName, ProcessorDescriptor.create("Proc.class"), 2); vA.addDataSource(sourceName, DataSourceDescriptor.create(InputDescriptor.create("In"), null, null)); vB.addDataSink(sinkName, DataSinkDescriptor.create(OutputDescriptor.create("Out"), null, null)); dag.addVertex(vA) .addVertex(vB) .addEdge( Edge.create(vA, vB, EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL, OutputDescriptor.create("Out"), InputDescriptor.create("In")))); IOStatistics ioStats = new IOStatistics(); ioStats.setDataSize(1); ioStats.setItemsProcessed(1); TaskStatistics vAStats = new TaskStatistics(); vAStats.addIO(vBName, ioStats); vAStats.addIO(sourceName, ioStats); TaskStatistics vBStats = new TaskStatistics(); vBStats.addIO(vAName, ioStats); vBStats.addIO(sinkName, ioStats); ByteArrayOutputStream bosA = new ByteArrayOutputStream(); DataOutput outA = new DataOutputStream(bosA); vAStats.write(outA); final byte[] payloadA = bosA.toByteArray(); ByteArrayOutputStream bosB = new ByteArrayOutputStream(); DataOutput outB = new DataOutputStream(bosB); vBStats.write(outB); final byte[] payloadB = bosB.toByteArray(); MockDAGAppMaster mockApp = tezClient.getLocalClient().getMockApp(); MockContainerLauncher mockLauncher = mockApp.getContainerLauncher(); mockLauncher.startScheduling(false); mockApp.statsDelegate = new StatisticsDelegate() { @Override public TaskStatistics getStatistics(TaskSpec taskSpec) { byte[] payload = 
payloadA; TaskStatistics stats = new TaskStatistics(); if (taskSpec.getVertexName().equals(vBName)) { payload = payloadB; } final DataInputByteBuffer in = new DataInputByteBuffer(); in.reset(ByteBuffer.wrap(payload)); try { // this ensures that the serde code path is covered. stats.readFields(in); } catch (IOException e) { Assert.fail(e.getMessage()); } return stats; } }; mockApp.doSleep = false; DAGClient dagClient = tezClient.submitDAG(dag); mockLauncher.waitTillContainersLaunched(); DAGImpl dagImpl = (DAGImpl) mockApp.getContext().getCurrentDAG(); mockLauncher.startScheduling(true); DAGStatus status = dagClient.waitForCompletion(); Assert.assertEquals(DAGStatus.State.SUCCEEDED, status.getState()); // verify that the values have been correct aggregated for (org.apache.tez.dag.app.dag.Vertex v : dagImpl.getVertices().values()) { VertexStatistics vStats = v.getStatistics(); if (v.getName().equals(vAName)) { Assert.assertEquals(3, vStats.getOutputStatistics(vBName).getDataSize()); Assert.assertEquals(3, vStats.getInputStatistics(sourceName).getDataSize()); Assert.assertEquals(3, vStats.getOutputStatistics(vBName).getItemsProcessed()); Assert.assertEquals(3, vStats.getInputStatistics(sourceName).getItemsProcessed()); } else { Assert.assertEquals(2, vStats.getInputStatistics(vAName).getDataSize()); Assert.assertEquals(2, vStats.getOutputStatistics(sinkName).getDataSize()); Assert.assertEquals(2, vStats.getInputStatistics(vAName).getItemsProcessed()); Assert.assertEquals(2, vStats.getOutputStatistics(sinkName).getItemsProcessed()); } } tezClient.stop(); }
Example 8
Source File: TestMockDAGAppMaster.java From tez with Apache License 2.0 | 4 votes |
// Memory stress variant of the statistics test: one vertex with many tasks and many
// data sources, all reporting the same serialized stats. Disabled by default.
@Ignore
@Test(timeout = 60000)
public void testBasicStatisticsMemory() throws Exception {
  Logger.getRootLogger().setLevel(Level.WARN);
  TezConfiguration tezconf = new TezConfiguration(defaultConf);
  MockTezClient tezClient =
      new MockTezClient("testMockAM", tezconf, true, null, null, null, null, false, false);
  tezClient.start();

  final String vAName = "abcdefghijklmnopqrstuvwxyz";
  int numTasks = 10000;
  int numSources = 10;

  // Every source reports 1 byte / 1 item per task.
  IOStatistics ioStats = new IOStatistics();
  ioStats.setDataSize(1);
  ioStats.setItemsProcessed(1);
  TaskStatistics vAStats = new TaskStatistics();

  DAG dag = DAG.create("testBasicStatisticsMemory");
  Vertex vA = Vertex.create(vAName, ProcessorDescriptor.create("Proc.class"), numTasks);
  for (int i = 0; i < numSources; ++i) {
    final String sourceName = i + vAName;
    vA.addDataSource(sourceName,
        DataSourceDescriptor.create(InputDescriptor.create(sourceName), null, null));
    vAStats.addIO(sourceName, ioStats);
  }
  dag.addVertex(vA);

  // Serialize the stats once; every task deserializes the same payload.
  ByteArrayOutputStream statsBytes = new ByteArrayOutputStream();
  DataOutput statsOut = new DataOutputStream(statsBytes);
  vAStats.write(statsOut);
  final byte[] payloadA = statsBytes.toByteArray();

  MockDAGAppMaster mockApp = tezClient.getLocalClient().getMockApp();
  MockContainerLauncher mockLauncher = mockApp.getContainerLauncher();
  mockLauncher.startScheduling(false);
  mockApp.statsDelegate = new StatisticsDelegate() {
    @Override
    public TaskStatistics getStatistics(TaskSpec taskSpec) {
      byte[] payload = payloadA;
      TaskStatistics stats = new TaskStatistics();
      final DataInputByteBuffer in = new DataInputByteBuffer();
      in.reset(ByteBuffer.wrap(payload));
      try {
        // this ensures that the serde code path is covered.
        stats.readFields(in);
      } catch (IOException e) {
        Assert.fail(e.getMessage());
      }
      return stats;
    }
  };
  mockApp.doSleep = false;
  DAGClient dagClient = tezClient.submitDAG(dag);
  mockLauncher.waitTillContainersLaunched();
  DAGImpl dagImpl = (DAGImpl) mockApp.getContext().getCurrentDAG();
  mockLauncher.startScheduling(true);
  DAGStatus status = dagClient.waitForCompletion();
  Assert.assertEquals(DAGStatus.State.SUCCEEDED, status.getState());
  // Each of numTasks tasks contributed 1 byte / 1 item to source "0"+vAName.
  Assert.assertEquals(numTasks,
      dagImpl.getVertex(vAName).getStatistics().getInputStatistics(0 + vAName).getDataSize());
  Assert.assertEquals(numTasks,
      dagImpl.getVertex(vAName).getStatistics().getInputStatistics(0 + vAName)
          .getItemsProcessed());
  checkMemory(dag.getName(), mockApp);
  tezClient.stop();
}