Java Code Examples for org.apache.spark.api.java.JavaRDD#map()
The following examples show how to use org.apache.spark.api.java.JavaRDD#map().
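Before the project-specific examples, here is a minimal, self-contained sketch of the basic JavaRDD#map() pattern they all build on. The class name, input values, and doubling lambda are illustrative placeholders, not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRddMapSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaRddMapSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4));

            // map() applies the given function to every element and returns a new RDD;
            // it is a lazy transformation, so nothing runs until an action such as collect().
            JavaRDD<Integer> doubled = numbers.map(x -> x * 2);

            System.out.println(doubled.collect()); // [2, 4, 6, 8]
        }
    }
}

The same call also accepts an org.apache.spark.api.java.function.Function implementation instead of a lambda, which is the style several of the examples below use.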
Example 1
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testGetTuple1DML() {
    System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
    JavaRDD<String> javaRddString = sc
            .parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("N=M*2").in("M", df).out("N");
    Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
    double[][] n = tuple._1().to2DDoubleArray();

    Assert.assertEquals(2.0, n[0][0], 0);
    Assert.assertEquals(4.0, n[0][1], 0);
    Assert.assertEquals(6.0, n[0][2], 0);
    Assert.assertEquals(8.0, n[1][0], 0);
    Assert.assertEquals(10.0, n[1][1], 0);
    Assert.assertEquals(12.0, n[1][2], 0);
    Assert.assertEquals(14.0, n[2][0], 0);
    Assert.assertEquals(16.0, n[2][1], 0);
    Assert.assertEquals(18.0, n[2][2], 0);
}
Example 2
Source File: SparkSharder.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static <L extends Locatable, SB extends ShardBoundary> JavaRDD<Shard<L>> shard(JavaSparkContext ctx,
        JavaRDD<L> locatables, Class<L> locatableClass, SAMSequenceDictionary sequenceDictionary,
        JavaRDD<SB> intervals, int maxLocatableLength, boolean useShuffle) {

    JavaRDD<ShardBoundary> paddedIntervals = intervals.map(ShardBoundary::paddedShardBoundary);
    if (useShuffle) {
        throw new UnsupportedOperationException("Shuffle not supported when sharding an RDD of intervals.");
    }
    return joinOverlapping(ctx, locatables, locatableClass, sequenceDictionary, paddedIntervals, maxLocatableLength,
            new MapFunction<Tuple2<ShardBoundary, Iterable<L>>, Shard<L>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Shard<L> call(Tuple2<ShardBoundary, Iterable<L>> value) {
                    return value._1().createShard(value._2());
                }
            });
}
Example 3
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");

    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example 4
Source File: MarkDuplicatesSparkUtilsUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Test
public void testReadsMissingReadGroups() {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname,
            true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY);
    samRecordSetBuilder.addFrag("READ", 0, 10000, false);

    JavaRDD<GATKRead> reads = ctx.parallelize(Lists.newArrayList(samRecordSetBuilder.getRecords()), 2)
            .map(SAMRecordToGATKReadAdapter::new);
    reads = reads.map(r -> { r.setReadGroup(null); return r; });
    SAMFileHeader header = samRecordSetBuilder.getHeader();

    try {
        MarkDuplicatesSparkUtils.transformToDuplicateNames(header,
                MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, null, reads, 2, false).collect();
        Assert.fail("Should have thrown an exception");
    } catch (Exception e) {
        Assert.assertTrue(e instanceof SparkException);
        Assert.assertTrue(e.getCause() instanceof UserException.ReadMissingReadGroup);
    }
}
Example 5
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testInputMatrixBlockDML() {
    System.out.println("MLContextTest - input MatrixBlock DML");

    List<String> list = new ArrayList<>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Matrix m = new Matrix(dataFrame);
    MatrixBlock matrixBlock = m.toMatrixBlock();
    Script script = dml("avg = avg(M);").in("M", matrixBlock).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
Example 6
Source File: InMemoryHashIndex.java From hudi with Apache License 2.0 | 6 votes |
@Override
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, JavaSparkContext jsc,
        HoodieTable<T> hoodieTable) {
    return writeStatusRDD.map(new Function<WriteStatus, WriteStatus>() {
        @Override
        public WriteStatus call(WriteStatus writeStatus) {
            for (HoodieRecord record : writeStatus.getWrittenRecords()) {
                if (!writeStatus.isErrored(record.getKey())) {
                    HoodieKey key = record.getKey();
                    Option<HoodieRecordLocation> newLocation = record.getNewLocation();
                    if (newLocation.isPresent()) {
                        recordLocationMap.put(key, newLocation.get());
                    } else {
                        // Delete existing index for a deleted record
                        recordLocationMap.remove(key);
                    }
                }
            }
            return writeStatus;
        }
    });
}
Example 7
Source File: TestLineRecordReaderFunction.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
public void testLineRecordReader() throws Exception {
    File dataFile = new ClassPathResource("iris.dat").getFile();
    List<String> lines = FileUtils.readLines(dataFile);

    JavaSparkContext sc = getContext();
    JavaRDD<String> linesRdd = sc.parallelize(lines);
    CSVRecordReader rr = new CSVRecordReader(0, ',');
    JavaRDD<List<Writable>> out = linesRdd.map(new LineRecordReaderFunction(rr));
    List<List<Writable>> outList = out.collect();

    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(dataFile));
    Set<List<Writable>> expectedSet = new HashSet<>();
    int totalCount = 0;
    while (rr2.hasNext()) {
        expectedSet.add(rr2.next());
        totalCount++;
    }

    assertEquals(totalCount, outList.size());
    for (List<Writable> line : outList) {
        assertTrue(expectedSet.contains(line));
    }
}
Example 8
Source File: EntitySalienceFeatureExtractorSpark.java From ambiverse-nlu with Apache License 2.0 | 5 votes |
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext)
        throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances = documents.flatMap(s -> {
        TOTAL_DOCS.add(1);
        return fe.getTrainingInstances(s.getJCas(),
                trainingSettings.getFeatureExtractor(),
                trainingSettings.getPositiveInstanceScalingFactor());
    });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
Example 9
Source File: SparkExport.java From DataVec with Apache License 2.0 | 5 votes |
public static void exportCSVSpark(String directory, String delimiter, String quote, int outputSplits,
        JavaRDD<List<Writable>> data) {

    //NOTE: Order is probably not random here...
    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    lines = lines.coalesce(outputSplits);   // coalesce returns a new RDD; reassign so outputSplits takes effect
    lines.saveAsTextFile(directory);
}
Example 10
Source File: DataFrameOps.java From toolbox with Apache License 2.0 | 5 votes |
static JavaRDD<DataInstance> toDataInstanceRDD(DataFrame data, Attributes attributes) {

    JavaRDD<double[]> rawRDD = data.rdd()
            .toJavaRDD()
            .map(row -> transformRow2DataInstance(row, attributes));

    return rawRDD.map(v -> new DataInstanceFromDataRow(new DataRowSpark(v, attributes)));
}
Example 11
Source File: MLLibUtil.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Converts JavaRDD labeled points to JavaRDD DataSets.
 * @param data JavaRDD LabeledPoints
 * @param numPossibleLabels number of possible labels
 * @param preCache boolean pre-cache rdd before operation
 * @return
 */
public static JavaRDD<DataSet> fromLabeledPoint(JavaRDD<LabeledPoint> data, final long numPossibleLabels,
        boolean preCache) {
    if (preCache && !data.getStorageLevel().useMemory()) {
        data.cache();
    }
    return data.map(new Function<LabeledPoint, DataSet>() {
        @Override
        public DataSet call(LabeledPoint lp) {
            return fromLabeledPoint(lp, numPossibleLabels);
        }
    });
}
Example 12
Source File: MLContextTest.java From systemds with Apache License 2.0 | 5 votes |
@Test
public void testDataFrameToBinaryBlocks() {
    System.out.println("MLContextTest - DataFrame to binary blocks");

    List<String> list = new ArrayList<>();
    list.add("1,2,3");
    list.add("4,5,6");
    list.add("7,8,9");
    JavaRDD<String> javaRddString = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = MLContextConversionUtil
            .dataFrameToMatrixBinaryBlocks(dataFrame);
    Tuple2<MatrixIndexes, MatrixBlock> first = binaryBlocks.first();
    MatrixBlock mb = first._2();
    double[][] matrix = DataConverter.convertToDoubleMatrix(mb);

    Assert.assertArrayEquals(new double[] { 1.0, 2.0, 3.0 }, matrix[0], 0.0);
    Assert.assertArrayEquals(new double[] { 4.0, 5.0, 6.0 }, matrix[1], 0.0);
    Assert.assertArrayEquals(new double[] { 7.0, 8.0, 9.0 }, matrix[2], 0.0);
}
Example 13
Source File: BatchHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Converts each row from the iotData RDD to a Measurement.
 *
 * @param iotData RDD of IoTData records parsed from the CSV file
 * @return an RDD containing all data from the CSV file as Measurements
 */
private JavaRDD<Measurement> csvToMeasurements(JavaRDD<IoTData> iotData) {
    JavaRDD<Measurement> map = iotData.map(row -> {
        Coordinate coordinate = new Coordinate(
                Double.valueOf(row.getLatitude()),
                Double.valueOf(row.getLongitude())
        );
        return new Measurement(coordinate, row.getTimestamp());
    });
    return map;
}
Example 14
Source File: PileupSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Override
protected void processAlignments(JavaRDD<LocusWalkerContext> rdd, JavaSparkContext ctx) {
    JavaRDD<String> lines = rdd.map(pileupFunction(metadata, outputInsertLength, showVerbose));
    if (numReducers != 0) {
        lines = lines.coalesce(numReducers);
    }
    lines.saveAsTextFile(outputFile);
}
Example 15
Source File: AdaBoostMHLearner.java From sparkboost with Apache License 2.0 | 4 votes |
protected void updateDistributionMatrix(JavaSparkContext sc, JavaRDD<MultilabelPoint> docs, double[][] localDM,
        WeakHypothesis localWH) {
    Broadcast<WeakHypothesis> distWH = sc.broadcast(localWH);
    Broadcast<double[][]> distDM = sc.broadcast(localDM);
    JavaRDD<DMPartialResult> partialResults = docs.map(doc -> {
        int[] validFeatures = doc.getFeatures().indices();
        HashMap<Integer, Integer> dictFeatures = new HashMap<>();
        for (int featID : validFeatures)
            dictFeatures.put(featID, featID);
        HashMap<Integer, Integer> dictLabels = new HashMap<>();
        for (int idx = 0; idx < doc.getLabels().length; idx++)
            dictLabels.put(doc.getLabels()[idx], doc.getLabels()[idx]);

        double[][] dm = distDM.getValue();
        WeakHypothesis wh = distWH.getValue();
        double[] labelsRes = new double[dm.length];
        for (int labelID = 0; labelID < dm.length; labelID++) {
            float catValue = 1;
            if (dictLabels.containsKey(labelID)) {
                catValue = -1;
            }

            // Compute the weak hypothesis value.
            double value = 0;
            WeakHypothesis.WeakHypothesisData v = wh.getLabelData(labelID);
            int pivot = v.getFeatureID();
            if (dictFeatures.containsKey(pivot))
                value = v.getC1();
            else
                value = v.getC0();

            double partialRes = dm[labelID][doc.getPointID()] * Math.exp(catValue * value);
            labelsRes[labelID] = partialRes;
        }

        return new DMPartialResult(doc.getPointID(), labelsRes);
    });

    Iterator<DMPartialResult> itResults = partialResults.toLocalIterator();

    // Update partial results.
    double normalization = 0;
    while (itResults.hasNext()) {
        DMPartialResult r = itResults.next();
        for (int labelID = 0; labelID < localDM.length; labelID++) {
            localDM[labelID][r.docID] = r.labelsRes[labelID];
            normalization += localDM[labelID][r.docID];
        }
    }

    // Normalize all values.
    for (int labelID = 0; labelID < localDM.length; labelID++) {
        for (int docID = 0; docID < localDM[0].length; docID++) {
            localDM[labelID][docID] = localDM[labelID][docID] / normalization;
        }
    }
}
Example 16
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java From ambiverse-nlu with Apache License 2.0 | 4 votes |
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext)
        throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae =
            EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances = documents
            .map(s -> {
                TOTAL_DOCS.add(1);
                Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
                String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
                tmpLogger.info("Processing document {}.", docId);
                // Before processing the document through the Disambiguation Pipeline, add the AIDA settings
                // in each document.
                SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                        trainingSettings.getDocumentCoherent(),
                        trainingSettings.getDocumentConfidenceThreshold());
                return ae.process(s);
            })
            .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
                    trainingSettings.getFeatureExtractor(),
                    trainingSettings.getPositiveInstanceScalingFactor()));

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("entity", DataTypes.StringType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
Example 17
Source File: JavaLatentDirichletAllocationExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
        new Function<String, Vector>() {
            public Vector call(String s) {
                String[] sarray = s.trim().split(" ");
                double[] values = new double[sarray.length];
                for (int i = 0; i < sarray.length; i++) {
                    values[i] = Double.parseDouble(sarray[i]);
                }
                return Vectors.dense(values);
            }
        }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
        JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
            new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
                public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
                    return doc_id.swap();
                }
            }
        ));
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
        System.out.print("Topic " + topic + ":");
        for (int word = 0; word < ldaModel.vocabSize(); word++) {
            System.out.print(" " + topics.apply(word, topic));
        }
        System.out.println();
    }

    ldaModel.save(jsc.sc(), "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
        "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
}
Example 18
Source File: HierarchicalAlignmentClient.java From render with GNU General Public License v2.0 | 4 votes |
private void createWarpStackForTier() throws IOException {
    LOG.info("createWarpStackForTier: entry");

    final ProcessTimer timer = new ProcessTimer();

    final Set<StackId> existingRoughProjectStackIds = new HashSet<>(driverRoughRender.getProjectStacks());

    final StackId warpStackId = HierarchicalStack.deriveWarpStackIdForTier(roughTilesStackId, currentTier);

    boolean generateWarpStack = true;
    if (existingRoughProjectStackIds.contains(warpStackId) &&
        parameters.keepExisting(PipelineStep.WARP)) {
        generateWarpStack = false;
    }

    if (generateWarpStack) {

        // remove any existing warp stack results
        driverRoughRender.deleteStack(warpStackId.getStack(), null);

        final StackMetaData roughTilesStackMetaData = driverRoughRender.getStackMetaData(roughTilesStackId.getStack());
        driverRoughRender.setupDerivedStack(roughTilesStackMetaData, warpStackId.getStack());

        final String projectForTier = this.tierProject;

        final JavaRDD<Double> rddZValues = sparkContext.parallelize(zValues);
        final HierarchicalWarpFieldStackFunction warpFieldStackFunction =
                new HierarchicalWarpFieldStackFunction(parameters.renderWeb.baseDataUrl,
                                                       parameters.renderWeb.owner,
                                                       currentTier,
                                                       projectForTier,
                                                       tierParentStackId,
                                                       warpStackId.getStack(),
                                                       parameters.consensusBuildMethod);

        final JavaRDD<Integer> rddTileCounts = rddZValues.map(warpFieldStackFunction);

        final List<Integer> tileCountList = rddTileCounts.collect();

        LOG.info("createWarpStackForTier: counting results");

        long total = 0;
        for (final Integer tileCount : tileCountList) {
            total += tileCount;
        }

        LOG.info("createWarpStackForTier: added {} tile specs to {}", total, warpStackId);

        driverRoughRender.setStackState(warpStackId.getStack(), StackMetaData.StackState.COMPLETE);
    }

    LOG.info("createWarpStackForTier: exit, processing took {} seconds", timer.getElapsedSeconds());
}
Example 19
Source File: DataFrames.java From DataVec with Apache License 2.0 | 3 votes |
/**
 * Creates a data frame from a collection of writables
 * rdd given a schema
 *
 * @param schema the schema to use
 * @param data   the data to convert
 * @return the dataframe object
 */
public static DataRowsFacade toDataFrame(Schema schema, JavaRDD<List<Writable>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.map(new ToRow(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchema(schema)));
}
Example 20
Source File: DataFrameOps.java From toolbox with Apache License 2.0 | 2 votes |
static JavaRDD<Row> toRowRDD(JavaRDD<DataInstance> rawRDD, Attributes atts) {
    // FIXME: Categorical values should be inserted with their corresponding state name
    return rawRDD.map(v -> transformArray2RowAttributes(v, atts));
}