Java Code Examples for org.apache.spark.api.java.JavaRDD#first()
The following examples show how to use org.apache.spark.api.java.JavaRDD#first(). Each example notes the original project, source file, and license it was taken from.
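Before the project-specific examples below, here is a minimal, self-contained sketch of what JavaRDD#first() does: it is an action that runs a Spark job and returns the first element of the RDD (throwing an exception if the RDD is empty). The class and variable names in this sketch are illustrative only and are not taken from any of the projects listed here.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FirstExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("firstExample");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> words = sc.parallelize(Arrays.asList("hello", "world"));
        // first() is an action: it triggers a job and returns the first element.
        String head = words.first();
        System.out.println(head); // prints "hello"
        sc.stop();
    }
}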
Example 1
Source File: Chapter4.java From sparkResearch with Apache License 2.0 | 5 votes |
/**
 * Split a string with flatMap.
 */
public void flatMap(JavaSparkContext sparkContext) {
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));
    JavaRDD<String> flatMapResult = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();
        }
    });
    flatMapResult.first();
    // Result: hello
}
Example 2
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc,
    ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue)
{
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown() ) { //nnz irrelevant here
        JavaRDD<String> tmp = input.values()
            .map(new TextToStringFunction());
        String tmpStr = tmp.first();
        boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX)
            || tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
        tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
        long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
        long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
        mc.set(rlen, clen, mc.getBlocksize(), -1);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text,Long> prepinput = input.values()
        .zipWithIndex(); //zip row index

    //prepare default schema if needed
    if( schema == null || schema.length==1 )
        schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

    //convert csv rdd to binary block rdd (w/ partial blocks)
    JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));

    return out;
}
Example 3
Source File: ActionRDD.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Take the first element.
 *
 * @since hui_project 1.0.0
 */
public void testFirst() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    String first = stringJavaRDD.first();
    System.out.println(first);
}
Example 4
Source File: ActionRDDTest.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Take the first element.
 *
 * @since hui_project 1.0.0
 */
@Test
public void testFirst() {
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    String first = stringJavaRDD.first();
    System.out.println(first);
}
Example 5
Source File: JavaRDDToDataset.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Converts a JavaRDD<Row> to a Dataset<Row>. This method only
 * supports simple data types, and all data must be non-null.
 *
 * @param data JavaRDD of Row objects
 * @param colNames names of the columns in a row
 * @return Dataset of Row objects with the given column names
 */
public static Dataset<Row> getDataset(JavaRDD<Row> data, String...colNames) {
    // create the schema for the dataset
    Row row = data.first();
    int length = row.length();
    if (length != colNames.length) {
        throw new IllegalArgumentException("colNames length does not match row length");
    }

    StructField[] sf = new StructField[length];
    for (int i = 0; i < row.size(); i++) {
        Object o = row.get(i);
        // TODO add more types
        if (o instanceof String) {
            sf[i] = DataTypes.createStructField(colNames[i], DataTypes.StringType, false);
        } else if (o instanceof Integer) {
            sf[i] = DataTypes.createStructField(colNames[i], DataTypes.IntegerType, false);
        } else if (o instanceof Long) {
            sf[i] = DataTypes.createStructField(colNames[i], DataTypes.LongType, false);
        } else if (o instanceof Float) {
            sf[i] = DataTypes.createStructField(colNames[i], DataTypes.FloatType, false);
        } else if (o instanceof Double) {
            sf[i] = DataTypes.createStructField(colNames[i], DataTypes.DoubleType, false);
        } else if (o instanceof Boolean) {
            sf[i] = DataTypes.createStructField(colNames[i], DataTypes.BooleanType, false);
        } else {
            System.out.println("Data type not implemented yet");
        }
    }
    StructType schema = new StructType(sf);

    // convert JavaRDD to Dataset
    SparkSession spark = SparkSession.builder().getOrCreate();
    return spark.createDataFrame(data, schema);
}
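A short, hypothetical usage of the getDataset method above. The row contents, column names, and the pre-existing JavaSparkContext sc are assumptions made for illustration and are not part of mmtf-spark.

// Hypothetical caller: build a two-column Dataset<Row> from an RDD of Rows.
// Assumes an existing JavaSparkContext `sc` and the usual Spark SQL imports.
JavaRDD<Row> rows = sc.parallelize(Arrays.asList(
        RowFactory.create("1ABC", 2.5),   // String column, Double column
        RowFactory.create("4HHB", 1.8)));
Dataset<Row> ds = JavaRDDToDataset.getDataset(rows, "structureId", "resolution");
ds.show(); // prints the two columns with the inferred schema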
Example 6
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Tokenizes a line.
 * @param line a line of text
 * @return a result text string
 */
public String tokenizeOneLine(String line) {
    List<String> list = new ArrayList<String>();
    list.add(line);
    JavaRDD<String> input = jsc.parallelize(list);
    JavaRDD<String> output = tokenize(input);
    return output.first();
}
Example 7
Source File: DeepSparkContext.java From deep-spark with Apache License 2.0 | 5 votes |
/**
 * Creates a JavaSchemaRDD from a DeepJobConfig and a JavaSQLContext.
 * @param config Specific Deep ExtractorConfig.
 * @return A JavaSchemaRDD built from Cells.
 * @throws UnsupportedDataTypeException
 */
public DataFrame createJavaSchemaRDD(ExtractorConfig<Cells> config) throws UnsupportedDataTypeException, UnsupportedOperationException {
    JavaRDD<Cells> cellsRDD = createJavaRDD(config);
    JavaRDD<Row> rowsRDD = DeepSparkContext.createJavaRowRDD(cellsRDD);
    try {
        Cells firstCells = cellsRDD.first();
        StructType schema = CellsUtils.getStructTypeFromCells(firstCells);
        return sqlContext.applySchema(rowsRDD, schema);
    } catch (UnsupportedOperationException e) {
        throw new UnsupportedOperationException("Cannot infer schema from empty data RDD", e);
    }
}
Example 8
Source File: PSScorerTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test(dataProvider = "mapPairs", groups = "spark") public void testMapGroupedReadsToTax(final int readLength, final List<Integer> NM1, final List<Integer> NM2, final List<Integer> clip1, final List<Integer> clip2, final List<Integer> insert1, final List<Integer> insert2, final List<Integer> delete1, final List<Integer> delete2, final List<String> contig1, final List<String> contig2, final List<Integer> truthTax) { final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase); //Test with alternate alignments assigned to the XA tag final List<Iterable<GATKRead>> readListXA = new ArrayList<>(); readListXA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "XA")); final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA); final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA, MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast); final PSPathogenAlignmentHit infoXA = resultXA.first()._2; Assert.assertNotNull(infoXA); Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size()); Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax)); Assert.assertEquals(infoXA.numMates, 2); //Test SA tag final List<Iterable<GATKRead>> readListSA = new ArrayList<>(); readListSA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "SA")); final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA); final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA, MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast); final PSPathogenAlignmentHit infoSA = resultSA.first()._2; Assert.assertNotNull(infoSA); Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size()); Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax)); Assert.assertEquals(infoSA.numMates, 2); }
Example 9
Source File: PSScorerTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test(dataProvider = "mapUnpaired", groups = "spark") public void testMapGroupedReadsToTaxUnpaired(final int readLength, final List<Integer> NM, final List<Integer> clip, final List<Integer> insert, final List<Integer> delete, final List<String> contig, final List<Integer> truthTax) { if (!(NM.size() == clip.size() && NM.size() == insert.size() && NM.size() == delete.size() && NM.size() == contig.size())) { throw new TestException("Input lists for read must be of uniform length"); } final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase); //Test with alternate alignments assigned to the XA tag final List<Iterable<GATKRead>> readListXA = new ArrayList<>(); readListXA.add(generateUnpairedRead(readLength, NM, clip, insert, delete, contig, "XA")); final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA); final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA, MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast); final PSPathogenAlignmentHit infoXA = resultXA.first()._2; Assert.assertNotNull(infoXA); Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size()); Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax)); Assert.assertEquals(infoXA.numMates, 1); //Test SA tag final List<Iterable<GATKRead>> readListSA = new ArrayList<>(); readListSA.add(generateUnpairedRead(readLength, NM, clip, insert, delete, contig, "SA")); final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA); final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA, MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast); final PSPathogenAlignmentHit infoSA = resultSA.first()._2; Assert.assertNotNull(infoSA); Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size()); Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax)); Assert.assertEquals(infoSA.numMates, 1); }
Example 10
Source File: CassandraCellExtractorFT.java From deep-spark with Apache License 2.0 | 4 votes |
@Override
protected void initDataSetDivineComedy(DeepSparkContext context) {
    JavaRDD<String> stringJavaRDD;

    //Divine Comedy
    List<String> lineas = readFile("/simpleDivineComedy.json");

    stringJavaRDD = context.parallelize(lineas);

    JavaRDD javaRDD = transformRDD(stringJavaRDD, Cells.class);

    originBook = javaRDD.first();

    DeepSparkContext.saveRDD(javaRDD.rdd(), getWriteExtractorConfig(BOOK_INPUT, Cells.class));
}
Example 11
Source File: CassandraEntityExtractorFT.java From deep-spark with Apache License 2.0 | 4 votes |
@Override
protected void initDataSetDivineComedy(DeepSparkContext context) {
    JavaRDD<String> stringJavaRDD;

    //Divine Comedy
    List<String> lineas = readFile("/simpleDivineComedy.json");

    stringJavaRDD = context.parallelize(lineas);

    JavaRDD javaRDD = transformRDD(stringJavaRDD, SimpleBookEntity.class);

    originBook = javaRDD.first();

    DeepSparkContext.saveRDD(javaRDD.rdd(), getWriteExtractorConfig(BOOK_INPUT, SimpleBookEntity.class));
}
Example 12
Source File: ExtractorTest.java From deep-spark with Apache License 2.0 | 3 votes |
protected void initDataSetDivineComedy(DeepSparkContext context) {
    JavaRDD<String> stringJavaRDD;

    //Divine Comedy
    List<String> lineas = readFile(DATA_TEST_DIVINE_COMEDY);

    stringJavaRDD = context.parallelize(lineas);

    JavaRDD<T> javaRDD = transformRDD(stringJavaRDD, configEntity);

    originBook = javaRDD.first();

    DeepSparkContext.saveRDD(javaRDD.rdd(), (ExtractorConfig<T>) getWriteExtractorConfig(BOOK_INPUT, configEntity));
}