org.apache.spark.TaskContext Java Examples

The following examples show how to use org.apache.spark.TaskContext. Each example is taken from an open-source project; the originating project, source file, and license are noted above each snippet.
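
As a quick orientation, here is a minimal, self-contained sketch (not taken from any of the projects listed below; the class and variable names are illustrative) of the TaskContext calls that recur throughout these examples: TaskContext.get() (which returns null outside of a running task), partitionId(), stageId(), taskAttemptId(), attemptNumber(), and addTaskCompletionListener().

import java.util.Arrays;

import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.TaskCompletionListener;

public class TaskContextSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "task-context-sketch");
    jsc.parallelize(Arrays.asList(1, 2, 3, 4), 2).foreachPartition(it -> {
      TaskContext ctx = TaskContext.get();      // non-null only inside a running task
      int partitionId = ctx.partitionId();      // same value as the static TaskContext.getPartitionId()
      int stageId = ctx.stageId();
      long taskAttemptId = ctx.taskAttemptId(); // unique per attempt within the application
      int attemptNumber = ctx.attemptNumber();  // 0 for the first attempt, >0 for retries
      // release per-partition resources when the task finishes (same cast idiom as Example #6)
      ctx.addTaskCompletionListener((TaskCompletionListener) done ->
          System.out.println("task " + done.taskAttemptId() + " completed"));
      System.out.println("stage=" + stageId + " partition=" + partitionId
          + " attempt=" + attemptNumber + " attemptId=" + taskAttemptId);
    });
    jsc.stop();
  }
}

The examples that follow use exactly this task metadata to build unique file names and write tokens, construct task IDs, and register cleanup hooks.
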
Example #1
Source File: TestCopyOnWriteActionExecutor.java    From hudi with Apache License 2.0
@Test
public void testMakeNewPath() throws Exception {
  String fileName = UUID.randomUUID().toString();
  String partitionPath = "2016/05/04";

  String instantTime = HoodieTestUtils.makeNewCommitTime();
  HoodieWriteConfig config = makeHoodieClientConfig();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieTable.create(metaClient, config, hadoopConf);

  Pair<Path, String> newPathWithWriteToken = jsc.parallelize(Arrays.asList(1)).map(x -> {
    HoodieRecord record = mock(HoodieRecord.class);
    when(record.getPartitionPath()).thenReturn(partitionPath);
    String writeToken = FSUtils.makeWriteToken(TaskContext.getPartitionId(), TaskContext.get().stageId(),
        TaskContext.get().taskAttemptId());
    HoodieCreateHandle io = new HoodieCreateHandle(config, instantTime, table, partitionPath, fileName, supplier);
    return Pair.of(io.makeNewPath(record.getPartitionPath()), writeToken);
  }).collect().get(0);

  assertEquals(newPathWithWriteToken.getKey().toString(), Paths.get(this.basePath, partitionPath,
      FSUtils.makeDataFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString());
}
 
Example #2
Source File: SpliceOutputCommitter.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void setupTask(TaskAttemptContext taskContext) throws IOException {

    if (LOG.isDebugEnabled())
        SpliceLogUtils.debug(LOG,"setupTask");
    // Create child additive transaction so we don't read rows inserted by ourselves in this operation
    TaskContext sparkTaskContext = TaskContext.get();
    TaskId taskId = null;
    if (sparkTaskContext != null) {
        int stageId = sparkTaskContext.stageId();
        int partitionId = sparkTaskContext.partitionId();
        int attemptNumber = sparkTaskContext.attemptNumber();
        taskId = new TaskId(stageId, partitionId, attemptNumber);
    }
    TxnView txn = SIDriver.driver().lifecycleManager().beginChildTransaction(parentTxn, parentTxn.getIsolationLevel(),
            true, destinationTable, false, taskId);
    ActiveWriteTxn childTxn = new ActiveWriteTxn(txn.getTxnId(), txn.getTxnId(), parentTxn, true, parentTxn.getIsolationLevel(), taskId);
    currentTxn.set(childTxn);
    if (LOG.isDebugEnabled())
        SpliceLogUtils.debug(LOG,"beginTxn=%s and destinationTable=%s",childTxn,destinationTable);

}
 
Example #3
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
 
Example #4
Source File: StreamNodeLoader.java    From sylph with Apache License 2.0
private static Sink<JavaRDD<Row>> loadRealTimeSink(RealTimeSink realTimeSink)
{
    return (Sink<JavaRDD<Row>>) rdd -> rdd.foreachPartition(partition -> {
        Throwable errorOrNull = null;
        try {
            int partitionId = TaskContext.getPartitionId();
            boolean openOK = realTimeSink.open(partitionId, 0); // initialize; returns whether the sink opened successfully; only process data if it did
            if (openOK) {
                partition.forEachRemaining(row -> realTimeSink.process(SparkRecord.make(row)));
            }
        }
        catch (Exception e) {
            errorOrNull = e; // open() threw an error
        }
        finally {
            realTimeSink.close(errorOrNull); //destroy()
        }
    });
}
 
Example #5
Source File: StreamNodeLoader.java    From sylph with Apache License 2.0
public static Iterator<Row> transFunction(Iterator<Row> partition, RealTimeTransForm realTimeTransForm)
{
    Exception errorOrNull = null;
    Schema schema = realTimeTransForm.getSchema(); // if not null
    List<Row> list = new ArrayList<>();
    try {
        int partitionId = TaskContext.getPartitionId();
        if (realTimeTransForm.open(partitionId, 0)) {
            partition.forEachRemaining(row -> {
                realTimeTransForm.process(SparkRecord.make(row), (transOutrow) -> {
                    //TODO: SparkRow.parserRow(x) with schema ?
                    list.add(SparkRecord.parserRow(transOutrow));
                });
            });
        }
    }
    catch (Exception e) {
        errorOrNull = e; // transform failed; this whole batch is discarded
    }
    finally {
        realTimeTransForm.close(errorOrNull); //destroy()
    }
    return list.iterator();
}
 
Example #6
Source File: NLJoinFunction.java    From spliceengine with GNU Affero General Public License v3.0
protected void init(Iterator<ExecRow> from) throws StandardException {
    checkInit();
    taskContext = TaskContext.get();
    if (taskContext != null) {
        taskContext.addTaskCompletionListener((TaskCompletionListener) (t) -> close());
    }
    operationContext.getOperation().registerCloseable(this);
    SConfiguration configuration= EngineDriver.driver().getConfiguration();
    batchSize = configuration.getNestedLoopJoinBatchSize();
    nLeftRows = 0;
    leftSideIterator = from;
    executorService = SIDriver.driver().getExecutorService();
    firstBatch = new ArrayDeque<>(batchSize);

    initOperationContexts();
    loadBatch();
}
 
Example #7
Source File: ClusterFunctionProvider.java    From datacollector with Apache License 2.0
public static synchronized ClusterFunction getClusterFunction() throws Exception {
  // Why such a complex name?
  // When an executor dies and a new one takes its place, having just partition id won't work, because the old file
  // might not have been closed by the namenode since the old executor handling that partition may have just died.
  // So we must ensure a truly unique part which is executor id.
  // ---- BUT ----
  // Multiple partitions of the same job can run on the same executor, which is especially true now since we allow
  // the user to set fewer executors than partitions, so we need the partition id.
  // ---- BUT ----
  // Users could end up not making it unique enough, since partition id and executor id are not unique across jobs, so
  // if they use ${sdc:id()} in 2 cluster pipelines with same directory, then it will still collide, so prefix this
  // with pipeline id.
  // ---- DONE, YAY! ----
  if (clusterFunction == null) {
    clusterFunction =
        (ClusterFunction) BootstrapCluster.getClusterFunction(
            BootstrapCluster.getProperties().getProperty(ClusterModeConstants.CLUSTER_PIPELINE_NAME) +
                "-" +
                TaskContext.get().partitionId() + "-" +
                SparkEnv.get().executorId()
        );
  }
  return clusterFunction;
}
 
Example #8
Source File: SparkAMDSI.java    From deeplearning4j with Apache License 2.0
public SparkAMDSI(MultiDataSetIterator iterator, int queueSize, BlockingQueue<MultiDataSet> queue,
                boolean useWorkspace, DataSetCallback callback, Integer deviceId) {
    this();

    if (queueSize < 2)
        queueSize = 2;

    this.callback = callback;
    this.buffer = queue;
    this.backedIterator = iterator;
    this.useWorkspaces = useWorkspace;
    this.prefetchSize = queueSize;
    this.workspaceId = "SAMDSI_ITER-" + java.util.UUID.randomUUID().toString();
    this.deviceId = deviceId;

    if (iterator.resetSupported())
        this.backedIterator.reset();

    this.thread = new SparkPrefetchThread(buffer, iterator, terminator, Nd4j.getAffinityManager().getDeviceForCurrentThread());

    context = TaskContext.get();

    thread.setDaemon(true);
    thread.start();
}
 
Example #9
Source File: CompatUtils.java    From elasticsearch-hadoop with Apache License 2.0
static void addOnCompletition(TaskContext taskContext, final Function0<?> function) {
    taskContext.addTaskCompletionListener(new TaskCompletionListener() {
        @Override
        public void onTaskCompletion(TaskContext context) {
            function.apply();
        }
    });
}
 
Example #10
Source File: SourceRDD.java    From beam with Apache License 2.0
@Override
public scala.collection.Iterator<scala.Tuple2<Source<T>, CheckpointMarkT>> compute(
    Partition split, TaskContext context) {
  @SuppressWarnings("unchecked")
  CheckpointableSourcePartition<T, CheckpointMarkT> partition =
      (CheckpointableSourcePartition<T, CheckpointMarkT>) split;
  scala.Tuple2<Source<T>, CheckpointMarkT> tuple2 =
      new scala.Tuple2<>(partition.getSource(), partition.checkpointMark);
  return JavaConversions.asScalaIterator(Collections.singleton(tuple2).iterator());
}
 
Example #11
Source File: HiveWarehouseDataReader.java    From spark-llap with Apache License 2.0
protected TaskAttemptID getTaskAttemptID(LlapInputSplit split, JobConf conf) throws IOException {
  //Get pseudo-ApplicationId to submit task attempt from external client
  SubmitWorkInfo submitWorkInfo = SubmitWorkInfo.fromBytes(split.getPlanBytes());
  ApplicationId appId = submitWorkInfo.getFakeAppId();
  JobID jobId = new JobID(Long.toString(appId.getClusterTimestamp()), appId.getId());
  //Create TaskAttemptID from Spark TaskContext (TaskType doesn't matter)
  return new TaskAttemptID(new TaskID(jobId, TaskType.MAP, TaskContext.get().partitionId()), TaskContext.get().attemptNumber());
}
 
Example #12
Source File: RemoteDPParForSparkWorker.java    From systemds with Apache License 2.0
@Override 
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0)
	throws Exception 
{
	//lazy parworker initialization
	configureWorker( TaskContext.get().taskAttemptId() );

	//process all matrix partitions of this data partition
	MatrixBlock partition = null;
	while( arg0.hasNext() )
	{
		Tuple2<Long,Iterable<Writable>> larg = arg0.next();
		
		//collect input partition (check via equals because oinfo deserialized instance)
		if( _fmt == FileFormat.BINARY )
			partition = collectBinaryBlock( larg._2(), partition );
		else
			partition = collectBinaryCellInput( larg._2() );
		
		//update in-memory matrix partition
		MatrixObject mo = _ec.getMatrixObject( _inputVar );
		mo.setInMemoryPartition( partition );
		
		//create tasks for input data
		Task lTask = new Task(_iterVar, TaskType.SET);
		lTask.addIteration( new IntObject(larg._1()) );
		
		//execute program
		long numIter = getExecutedIterations();
		super.executeTask( lTask );
		
		//maintain accumulators
		_aTasks.add( 1 );
		_aIters.add( (int)(getExecutedIterations()-numIter) );
	}
	
	//write output if required (matrix indexed write)
	return RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
		.stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
 
Example #13
Source File: RemoteParForSparkWorker.java    From systemds with Apache License 2.0
@Override 
public Iterator<Tuple2<Long, String>> call(Task arg0)
	throws Exception 
{
	//lazy parworker initialization
	if( !_initialized )
		configureWorker(TaskContext.get().taskAttemptId());
	
	//keep input var names
	Set<String> inVars = new HashSet<>(_ec.getVariables().keySet());
	
	//execute a single task
	long numIter = getExecutedIterations();
	super.executeTask( arg0 );
	
	//maintain accumulators
	_aTasks.add( 1 );
	_aIters.add( (int)(getExecutedIterations()-numIter) );
	
	//cleanup remaining intermediate variables from buffer pool
	_ec.getVariables().keySet().stream().filter(v -> !inVars.contains(v))
		.map(v -> _ec.getVariable(v)).filter(d -> d instanceof CacheableData)
		.forEach(c -> ((CacheableData<?>)c).freeEvictedBlob());
	
	//write output lineage if required
	if( DMLScript.LINEAGE )
		RemoteParForUtils.exportLineageItems(_workerID, 
			_ec.getVariables(), _resultVars, _ec.getLineage());
	
	//write output if required (matrix indexed write), incl cleanup pinned vars
	//note: this copy is necessary for environments without spark libraries
	return RemoteParForUtils
		.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
		.stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
 
Example #14
Source File: DeepRDD.java    From deep-spark with Apache License 2.0
@Override
public Iterator<T> compute(Partition split, TaskContext context) {

    initExtractorClient();

    extractorClient.initIterator(split, config.getValue());

    context.addTaskCompletionListener(new AbstractFunction1<TaskContext, BoxedUnit>() {

        @Override
        public BoxedUnit apply(TaskContext v1) {
            extractorClient.close();
            return null;
        }
    });

    java.util.Iterator<T> iterator = new java.util.Iterator<T>() {

        @Override
        public boolean hasNext() {
            return extractorClient.hasNext();
        }

        @Override
        public T next() {
            return extractorClient.next();
        }

        @Override
        public void remove() {
            throw new DeepIOException(
                    "Method not implemented (and won't be implemented anytime soon!!!)");
        }
    };

    return new InterruptibleIterator<>(context, asScalaIterator(iterator));

}
 
Example #15
Source File: SparkADSI.java    From deeplearning4j with Apache License 2.0
public SparkADSI(DataSetIterator iterator, int queueSize, BlockingQueue<DataSet> queue, boolean useWorkspace,
                DataSetCallback callback, Integer deviceId) {
    this();

    if (queueSize < 2)
        queueSize = 2;

    this.deviceId = deviceId;
    this.callback = callback;
    this.useWorkspace = useWorkspace;
    this.buffer = queue;
    this.prefetchSize = queueSize;
    this.backedIterator = iterator;
    this.workspaceId = "SADSI_ITER-" + java.util.UUID.randomUUID().toString();

    if (iterator.resetSupported())
        this.backedIterator.reset();

    context = TaskContext.get();

    this.thread = new SparkPrefetchThread(buffer, iterator, terminator, null, Nd4j.getAffinityManager().getDeviceForCurrentThread());

    /**
     * We want to ensure that the background thread has the same thread->device affinity as the master thread
     */

    thread.setDaemon(true);
    thread.start();
}
 
Example #16
Source File: RemoteDPParForSparkWorker.java    From systemds with Apache License 2.0
@Override 
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0)
	throws Exception 
{
	//lazy parworker initialization
	configureWorker( TaskContext.get().taskAttemptId() );

	//process all matrix partitions of this data partition
	MatrixBlock partition = null;
	while( arg0.hasNext() )
	{
		Tuple2<Long,Iterable<Writable>> larg = arg0.next();
		
		//collect input partition (check via equals because oinfo deserialized instance)
		if( _oinfo.equals(OutputInfo.BinaryBlockOutputInfo) )
			partition = collectBinaryBlock( larg._2(), partition );
		else
			partition = collectBinaryCellInput( larg._2() );
		
		//update in-memory matrix partition
		MatrixObject mo = _ec.getMatrixObject( _inputVar );
		mo.setInMemoryPartition( partition );
		
		//create tasks for input data
		Task lTask = new Task(_iterVar, TaskType.SET);
		lTask.addIteration( new IntObject(larg._1()) );
		
		//execute program
		long numIter = getExecutedIterations();
		super.executeTask( lTask );
		
		//maintain accumulators
		_aTasks.add( 1 );
		_aIters.add( (int)(getExecutedIterations()-numIter) );
	}
	
	//write output if required (matrix indexed write)
	return RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
		.stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
 
Example #17
Source File: IteratorUtils.java    From spliceengine with GNU Affero General Public License v3.0
public static <E> Iterator<E> asInterruptibleIterator(Iterator<E> it) {
    TaskContext context = TaskContext.get();
    if (context != null) {
        return (Iterator<E>) JavaConverters.asJavaIteratorConverter(new InterruptibleIterator(context, JavaConverters.asScalaIteratorConverter(it).asScala())).asJava();
    } else
        return it;
}
 
Example #18
Source File: SparkLeanOperationContext.java    From spliceengine with GNU Affero General Public License v3.0
@Override
@SuppressFBWarnings(value = "ST_WRITE_TO_STATIC_FROM_INSTANCE_METHOD", justification = "intended")
public void readExternal(ObjectInput in)
        throws IOException, ClassNotFoundException{
    if (in.readBoolean()) {
        SpliceClient.connectionString = in.readUTF();
        SpliceClient.setClient(HConfiguration.getConfiguration().getAuthenticationTokenEnabled(), SpliceClient.Mode.EXECUTOR);
    }
    badRecordsSeen = in.readLong();
    badRecordThreshold = in.readLong();
    permissive=in.readBoolean();
    SpliceSpark.setupSpliceStaticComponents();
    boolean isOp=in.readBoolean();
    if(isOp){
        broadcastedActivation = (BroadcastedActivation)in.readObject();
        ActivationHolder ah = broadcastedActivation.getActivationHolder();
        op=(Op)ah.getOperationsMap().get(in.readInt());
        activation = ah.getActivation();
        TaskContext taskContext = TaskContext.get();
        if (taskContext != null) {
            taskContext.addTaskCompletionListener((TaskCompletionListener)(ctx) -> ah.close());
        }
    }
    badRecordsAccumulator = (Accumulable<BadRecordsRecorder,String>) in.readObject();
    importFileName= (String) in.readObject();
    rowsWritten=(LongAccumulator)in.readObject();
}
 
Example #19
Source File: KafkaStreamer.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public Iterator<String> call(Integer partition, Iterator<T> locatedRowIterator) throws Exception {
    taskContext = TaskContext.get();

    if (taskContext != null && taskContext.attemptNumber() > 0) {
        LOG.trace("KS.c attempts "+taskContext.attemptNumber());
        long entriesInKafka = KafkaUtils.messageCount(bootstrapServers, topicName, partition);
        LOG.trace("KS.c entries "+entriesInKafka);
        for (long i = 0; i < entriesInKafka; ++i) {
            locatedRowIterator.next();
        }
    }

    Properties props = new Properties();
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
    props.put(ProducerConfig.CLIENT_ID_CONFIG, "spark-producer-dss-ks-"+UUID.randomUUID() );
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, IntegerSerializer.class.getName());
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ExternalizableSerializer.class.getName());
    KafkaProducer<Integer, Externalizable> producer = new KafkaProducer<>(props);
    int count = 0 ;
    while (locatedRowIterator.hasNext()) {
        T lr = locatedRowIterator.next();

        ProducerRecord<Integer, Externalizable> record = new ProducerRecord(topicName, count++, lr);
        producer.send(record);
        LOG.trace("KS.c sent "+partition.intValue()+" "+count+" "+lr);
    }
    LOG.trace("KS.c count "+partition.intValue()+" "+count);

    producer.close();
    // TODO Clean up
    return Arrays.asList("OK").iterator();
}
 
Example #20
Source File: SourceRDD.java    From beam with Apache License 2.0
@Override
public scala.collection.Iterator<WindowedValue<T>> compute(
    final Partition split, final TaskContext context) {
  final MetricsContainer metricsContainer = metricsAccum.value().getContainer(stepName);

  @SuppressWarnings("unchecked")
  final BoundedSource.BoundedReader<T> reader = createReader((SourcePartition<T>) split);

  final Iterator<WindowedValue<T>> readerIterator =
      new ReaderToIteratorAdapter<>(metricsContainer, reader);

  return new InterruptibleIterator<>(context, JavaConversions.asScalaIterator(readerIterator));
}
 
Example #21
Source File: ExpKeyFilenameMap.java    From incubator-retired-pirk with Apache License 2.0
@Override
public Iterator<Tuple2<Integer, String>> call(Iterator<Tuple2<Integer,Iterable<Tuple2<Integer,BigInteger>>>> iter) throws Exception
{
  List<Tuple2<Integer,String>> keyFileList = new ArrayList<>();

  FileSystem fs = FileSystem.get(new Configuration());

  // Form the filename for the exp table portion that corresponds to this partition
  int taskId = TaskContext.getPartitionId();
  logger.info("taskId = " + taskId);

  String fileName = expOutDir + "/exp-" + String.format("%05d", taskId);
  logger.info("fileName = " + fileName);

  // Iterate over the elements of the partition
  BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(fileName), true)));
  while (iter.hasNext())
  {
    // <queryHash, <<power>,<element^power mod N^2>>
    Tuple2<Integer,Iterable<Tuple2<Integer,BigInteger>>> expTuple = iter.next();
    int queryHash = expTuple._1;

    // Record the queryHash -> fileName
    keyFileList.add(new Tuple2<>(queryHash, fileName));

    // Write the partition elements to the corresponding exp table file
    // each line: queryHash,<power>-<element^power mod N^2>
    for (Tuple2<Integer,BigInteger> modPow : expTuple._2)
    {
      String lineOut = queryHash + "," + modPow._1 + "-" + modPow._2;
      bw.write(lineOut);
      bw.newLine();
    }
  }
  bw.close();

  return keyFileList.iterator();
}
 
Example #22
Source File: MizoRDD.java    From mizo with Apache License 2.0
@Override
public scala.collection.Iterator<TReturn> compute(Partition split, TaskContext context) {
    String regionEdgesFamilyPath = this.regionsPaths.get(split.index());
    log.info("Running Mizo on region #{} located at: {}", split.index(), regionEdgesFamilyPath);

    return createRegionIterator(createRegionRelationsIterator(regionEdgesFamilyPath));
}
 
Example #23
Source File: SMRecordReaderImpl.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void onTaskFailure(TaskContext context, Throwable error) {
	LOG.error("Task failed for split: " + split, error);
}
 
Example #24
Source File: SMRecordReaderImpl.java    From spliceengine with GNU Affero General Public License v3.0
public void init(Configuration config, InputSplit split) throws IOException, InterruptedException {	
	if (LOG.isDebugEnabled())
		SpliceLogUtils.debug(LOG, "init");
	if (TaskContext.get() != null) {
		TaskContext.get().addTaskFailureListener(this);
	}
	String tableScannerAsString = config.get(MRConstants.SPLICE_SCAN_INFO);
       if (tableScannerAsString == null)
		throw new IOException("splice scan info was not serialized to task, failing");
	byte[] scanStartKey = null;
	byte[] scanStopKey = null;
	try {
		builder = TableScannerBuilder.getTableScannerBuilderFromBase64String(tableScannerAsString);
		if (LOG.isTraceEnabled())
			SpliceLogUtils.trace(LOG, "config loaded builder=%s", builder);
		TableSplit tSplit = ((SMSplit) split).getSplit();
		token = builder.getToken();
		DataScan scan = builder.getScan();
		scanStartKey = scan.getStartKey();
		scanStopKey = scan.getStopKey();
		if (Bytes.startComparator.compare(scanStartKey, tSplit.getStartRow()) < 0) {
			// the split itself is more restrictive
			scan.startKey(tSplit.getStartRow());
		}
		if (Bytes.endComparator.compare(scanStopKey, tSplit.getEndRow()) > 0) {
			// the split itself is more restrictive
			scan.stopKey(tSplit.getEndRow());
		}
		setScan(((HScan) scan).unwrapDelegate());
		// TODO (wjk): this seems weird (added with DB-4483)
		this.statisticsRun = AbstractSMInputFormat.oneSplitPerRegion(config);
		Double sampling = AbstractSMInputFormat.sampling(config);
		if (sampling != null) {
			this.sampling = true;
			this.samplingRate = sampling;
		}
		restart(scan.getStartKey());
	} catch (IOException ioe) {
		LOG.error(String.format("Received exception with scan %s, original start key %s, original stop key %s, split %s",
				scan, Bytes.toStringBinary(scanStartKey), Bytes.toStringBinary(scanStopKey), split), ioe);
		throw ioe;
       } catch (StandardException e) {
		throw new IOException(e);
	}
}
 
Example #25
Source File: SparkFactDistinct.java    From kylin-on-parquet-v2 with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // hll
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}
 
Example #26
Source File: KafkaReadFunction.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public Iterator<ExecRow> call(Integer partition) throws Exception {
    Properties props = new Properties();

    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);

    String consumer_id = "spark-consumer-dss-krf-"+UUID.randomUUID();
    props.put(ConsumerConfig.GROUP_ID_CONFIG, consumer_id);
    props.put(ConsumerConfig.CLIENT_ID_CONFIG, consumer_id);

    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, IntegerDeserializer.class.getName());
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ExternalizableDeserializer.class.getName());
    props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

    KafkaConsumer<Integer, Externalizable> consumer = new KafkaConsumer<Integer, Externalizable>(props);
    consumer.assign(Arrays.asList(new TopicPartition(topicName, partition)));

    return new Iterator<ExecRow>() {
        Iterator<ConsumerRecord<Integer, Externalizable>> it = null;

        @Override
        public boolean hasNext() {
            if (it == null) {
                ConsumerRecords<Integer, Externalizable> records = null;
                while (records == null || records.isEmpty()) {
                    records = consumer.poll( java.time.Duration.ofMillis(1000) );
                    if (TaskContext.get().isInterrupted()) {
                        consumer.close();
                        throw new TaskKilledException();
                    }
                }
                it = records.iterator();
            }
            if (it.hasNext()) {
                return true;
            }
            else {
                consumer.close();
                return false;
            }
        }

        @Override
        public ExecRow next() {
            return (ExecRow)it.next().value();
        }
    };
}
 
Example #27
Source File: SparkJavaRDD.java    From incubator-nemo with Apache License 2.0
@Override
public Iterator<T> iterator(final Partition split, final TaskContext taskContext) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
 
Example #28
Source File: RowDataRewriter.java    From iceberg with Apache License 2.0
private TaskResult rewriteDataForTask(CombinedScanTask task) throws Exception {
  TaskContext context = TaskContext.get();
  int partitionId = context.partitionId();
  long taskId = context.taskAttemptId();

  RowDataReader dataReader = new RowDataReader(
      task, schema, schema, nameMapping, io.value(), encryptionManager.value(), caseSensitive);

  SparkAppenderFactory appenderFactory = new SparkAppenderFactory(
      properties, schema, SparkSchemaUtil.convert(schema));
  OutputFileFactory fileFactory = new OutputFileFactory(
      spec, format, locations, io.value(), encryptionManager.value(), partitionId, taskId);

  BaseWriter writer;
  if (spec.fields().isEmpty()) {
    writer = new UnpartitionedWriter(spec, format, appenderFactory, fileFactory, io.value(), Long.MAX_VALUE);
  } else {
    writer = new PartitionedWriter(spec, format, appenderFactory, fileFactory, io.value(), Long.MAX_VALUE, schema);
  }

  try {
    while (dataReader.next()) {
      InternalRow row = dataReader.get();
      writer.write(row);
    }

    dataReader.close();
    dataReader = null;
    return writer.complete();

  } catch (Throwable originalThrowable) {
    try {
      LOG.error("Aborting task", originalThrowable);
      context.markTaskFailed(originalThrowable);

      LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})",
          partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber());
      if (dataReader != null) {
        dataReader.close();
      }
      writer.abort();
      LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})",
          partitionId, taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber());

    } catch (Throwable inner) {
      if (originalThrowable != inner) {
        originalThrowable.addSuppressed(inner);
        LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner);
      }
    }

    if (originalThrowable instanceof Exception) {
      throw originalThrowable;
    } else {
      throw new RuntimeException(originalThrowable);
    }
  }
}
 
Example #29
Source File: VideoStreamProcessor.java    From video-stream-classification with Apache License 2.0
public static void main(String[] args) throws Exception {
//Read properties
Properties prop = PropertyFileReader.readPropertyFile();

//SparkSession
SparkSession spark = SparkSession
	      .builder()
	      .appName("VideoStreamProcessor")
	      .master(prop.getProperty("spark.master.url"))
	      .getOrCreate();	

//directory to save image files with motion detected
final String processedImageDir = prop.getProperty("processed.output.dir");
logger.warn("Output directory for saving processed images is set to "+processedImageDir+". This is configured in processed.output.dir key of property file.");

//create schema for json message
StructType schema =  DataTypes.createStructType(new StructField[] { 
		DataTypes.createStructField("cameraId", DataTypes.StringType, true),
		DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
		DataTypes.createStructField("rows", DataTypes.IntegerType, true),
		DataTypes.createStructField("cols", DataTypes.IntegerType, true),
		DataTypes.createStructField("type", DataTypes.IntegerType, true),
		DataTypes.createStructField("data", DataTypes.StringType, true)
		});


//Create DataSet from stream messages from kafka
   Dataset<VideoEventData> ds = spark
     .readStream()
     .format("kafka")
     .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
     .option("subscribe", prop.getProperty("kafka.topic"))
     .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
     .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
     .load()
     .selectExpr("CAST(value AS STRING) as message")
     .select(functions.from_json(functions.col("message"),schema).as("json"))
     .select("json.*")
     .as(Encoders.bean(VideoEventData.class)); 
   
   //key-value pair of cameraId-VideoEventData
KeyValueGroupedDataset<String, VideoEventData> kvDataset = ds.groupByKey(new MapFunction<VideoEventData, String>() {
	@Override
	public String call(VideoEventData value) throws Exception {
		return value.getCameraId();
	}
}, Encoders.STRING());
	
//process
Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData,VideoEventData>(){
	@Override
	public VideoEventData call(String key, Iterator<VideoEventData> values, GroupState<VideoEventData> state) throws Exception {
		logger.warn("CameraId="+key+" PartitionId="+TaskContext.getPartitionId());
		VideoEventData existing = null;
		//check previous state
		if (state.exists()) {
			existing = state.get();
		}
		//classify image
		VideoEventData processed = ImageProcessor.process(key,values,processedImageDir,existing);
		
		//update last processed
		if(processed != null){
			state.update(processed);
		}
		return processed;
	}}, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

//start
 StreamingQuery query = processedDataset.writeStream()
	      .outputMode("update")
	      .format("console")
	      .start();
 
 //await
    query.awaitTermination();
}