Java Code Examples for org.apache.spark.api.java.JavaPairRDD#count()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#count().
You can go to the original project or source file by following the links above each example.
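JavaPairRDD#count() is a Spark action: it returns the number of key-value pairs in the RDD as a long and triggers evaluation of the RDD's lineage, which is why several of the examples below call it purely to force materialization of a persisted RDD. Before the project examples, here is a minimal, self-contained sketch of the call; the class name and the local master setting are illustrative and not taken from any of the projects below.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class JavaPairRddCountExample {
    public static void main(String[] args) {
        // Local Spark context purely for illustration (hypothetical app name and master).
        SparkConf conf = new SparkConf().setAppName("JavaPairRddCountExample").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Build a small pair RDD of (word, length) tuples.
            JavaPairRDD<String, Integer> pairs = jsc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("spark", 5),
                    new Tuple2<>("count", 5),
                    new Tuple2<>("rdd", 3)));

            // count() is an action: it runs the job and returns the number of pairs.
            long n = pairs.count();
            System.out.println("Number of pairs: " + n); // prints 3
        }
    }
}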
Example 1
Source File: SparkTableChecker.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Check invalid indexes which do not refer to a base table row
 * @param table
 * @param index
 * @return
 * @throws StandardException
 */
private List<String> checkInvalidIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    List<String> messages = Lists.newLinkedList();
    SpliceSpark.pushScope(String.format("Check invalidate index from %s.%s", schemaName, indexName));
    PairDataSet<String, Tuple2<byte[], ExecRow>> d1 = index.subtractByKey(table, null);
    SpliceSpark.popScope();

    JavaPairRDD rdd = ((SparkPairDataSet) d1).rdd;
    invalidIndexCount = rdd.count();
    if (invalidIndexCount > 0) {
        if (fix) {
            return fixInvalidIndexes(rdd);
        } else {
            return reportInvalidIndexes(rdd);
        }
    }
    return messages;
}
Example 2
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 6 votes |
@SuppressWarnings("unchecked") public void cacheMatrixObject( String var ) { //get input rdd and default storage level MatrixObject mo = getMatrixObject(var); //double check size to avoid unnecessary spark context creation if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(mo.getDataCharacteristics())) ) return; JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo); //persist rdd (force rdd caching, if not already cached) if( !isRDDCached(in.id()) ) in.count(); //trigger caching to prevent contention }
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 6 votes |
@SuppressWarnings("unchecked") public void cacheMatrixObject( String var ) { //get input rdd and default storage level MatrixObject mo = getMatrixObject(var); //double check size to avoid unnecessary spark context creation if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(mo.getDataCharacteristics())) ) return; JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) getRDDHandleForMatrixObject(mo, FileFormat.BINARY); //persist rdd (force rdd caching, if not already cached) if( !isRDDCached(in.id()) ) in.count(); //trigger caching to prevent contention }
Example 4
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 6 votes |
@Test
public void test4() throws IOException {
    List<String> pdbIds = Arrays.asList("1BZ5"); //D5
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

    /*
     * C5-B4
     * C6-B3
     * D7-A2
     * D8-A1
     * E10-E9
     */
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(9, 10, false, true));

    long count = pdb.count();
    assertEquals(5, count);
}
Example 5
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 6 votes |
@Test
public void test3() throws IOException {
    List<String> pdbIds = Arrays.asList("4GIS"); //D4

    /*
     * A3-A2
     * A4-A1
     * B5-A1
     * B6-A2
     * B6-B5
     * B7-A3
     * B7-A4
     * B8-A4
     * B8-B7
     */
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));

    long count = pdb.count();
    assertEquals(9, count);
}
Example 6
Source File: JavaTC.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTC")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    Integer slices = (args.length > 0) ? Integer.parseInt(args[0]) : 2;
    JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
        new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<>(e._2(), e._1());
            }
        });

    long oldCount;
    long nextCount = tc.count();
    do {
        oldCount = nextCount;
        // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        // then project the result to obtain the new (x, z) paths.
        tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
        nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    spark.stop();
}
Example 7
Source File: SparkTableChecker.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Check base table whether all rows are indexed
 * @param table
 * @param index
 * @return
 * @throws StandardException
 */
private List<String> checkMissingIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    List<String> messages = Lists.newLinkedList();
    SpliceSpark.pushScope(String.format("Check unindexed rows from table %s.%s", schemaName, tableName));
    PairDataSet<String, ExecRow> d2 = table.subtractByKey(index, null);

    JavaPairRDD rdd = ((SparkPairDataSet)d2).rdd;
    missingIndexCount = rdd.count();
    if (missingIndexCount > 0) {
        messages = reportMissingIndexes(rdd, fix);
    }
    return messages;
}
Example 8
Source File: HoodieBloomIndex.java From hudi with Apache License 2.0 | 5 votes |
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {

  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
  }

  // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
  JavaPairRDD<String, String> partitionRecordKeyPairRDD =
      recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Lookup indexes for all the partition/recordkey pair
  JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (LOG.isDebugEnabled()) {
    long totalTaggedRecords = keyFilenamePairRDD.count();
    LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    keyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
Example 9
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 5 votes |
@Test
public void test2() throws IOException {
    List<String> pdbIds = Arrays.asList("5NV3"); // D4
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));

    long count = pdb.count();
    assertEquals(12, count);
}
Example 10
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 5 votes |
@Test
public void test1() throws IOException {
    List<String> pdbIds = Arrays.asList("1I1G"); // D4
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));

    long count = pdb.count();
    assertEquals(4, count);
}
Example 11
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);

    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 12
Source File: SparkCubingByLayer.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob
            .loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);

    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 13
Source File: SparkColumnCardinality.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String tableName = optionsHelper.getOptionValue(OPTION_TABLE_NAME);
    String output = optionsHelper.getOptionValue(OPTION_OUTPUT);
    int columnCnt = Integer.valueOf(optionsHelper.getOptionValue(OPTION_COLUMN_COUNT));

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Calculate table:" + tableName);
    //set spark.sql.catalogImplementation=hive, If it is not set, SparkSession can't read hive metadata, and throw "org.apache.spark.sql.AnalysisException"
    conf.set("spark.sql.catalogImplementation", "hive");
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(output));
        // table will be loaded by spark sql, so isSequenceFile set false
        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(false, sc, null, tableName);

        JavaPairRDD<Integer, Long> resultRdd = recordRDD.mapPartitionsToPair(new BuildHllCounter())
                .reduceByKey((x, y) -> {
                    x.merge(y);
                    return x;
                })
                .mapToPair(record -> {
                    return new Tuple2<>(record._1, record._2.getCountEstimate());
                })
                .sortByKey(true, 1)
                .cache();

        if (resultRdd.count() == 0) {
            ArrayList<Tuple2<Integer, Long>> list = new ArrayList<>();
            for (int i = 0; i < columnCnt; ++i) {
                list.add(new Tuple2<>(i, 0L));
            }
            JavaPairRDD<Integer, Long> nullRdd = sc.parallelizePairs(list).repartition(1);
            nullRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        } else {
            resultRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        }
    }
}
Example 14
Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

    // This avoids errors such as java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it was already assigned a level
    // Ideally, we should ensure that we donot redundantly call persist on the same RDD.
    StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();

    //cache right hand side because accessed many times
    in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
             .persist(pmapmmStorageLevel);

    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() )
    {
        //create broadcast for rdd partition
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
            .filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
            .mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));

        int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
        PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
        Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

        //matrix multiplication
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
            .flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
        rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
        rdd2.persist(pmapmmStorageLevel)
            .count();
        bpmb.unpersist(false);

        if( out == null )
            out = rdd2;
        else
            out = out.union(rdd2);
    }

    //cache final result
    out = out.persist(pmapmmStorageLevel);
    out.count();

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());

    //update output statistics if not inferred
    updateBinaryMMOutputDataCharacteristics(sec, true);
}
Example 15
Source File: GeoWaveSparkSQLIT.java From geowave with Apache License 2.0 | 4 votes |
@Test
public void testCreateDataFrame() throws Exception {
    // Set up Spark
    final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
    final SparkContext context = session.sparkContext();

    // ingest test points
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

    final SqlQueryRunner queryRunner = new SqlQueryRunner();
    queryRunner.setSparkSession(session);
    try {
        // Load RDD from datastore, no filters
        final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
        final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

        final long count = javaRdd.count();
        LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

        queryRunner.addInputStore(dataStore, null, "features");

        final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

        Dataset<Row> results = queryRunner.run();
        final long containsCount = results.count();
        LOGGER.warn("Got " + containsCount + " for GeomContains test");

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
        results = queryRunner.run();
        final long withinCount = results.count();
        LOGGER.warn("Got " + withinCount + " for GeomWithin test");

        Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

        // Test the output writer
        final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);
        sqlResultsWriter.writeResults("sqltest");

        queryRunner.removeAllStores();

        // Test other spatial UDFs
        final String line1 = "LINESTRING(0 0, 10 10)";
        final String line2 = "LINESTRING(0 10, 10 0)";
        queryRunner.setSql(
            "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");

        Row result = queryRunner.run().head();
        final boolean intersect = result.getBoolean(0);
        LOGGER.warn("GeomIntersects returned " + intersect);
        Assert.assertTrue("Lines should intersect", intersect);

        queryRunner.setSql(
            "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
        result = queryRunner.run().head();
        final boolean disjoint = result.getBoolean(0);
        LOGGER.warn("GeomDisjoint returned " + disjoint);
        Assert.assertFalse("Lines should not be disjoint", disjoint);
    } catch (final Exception e) {
        e.printStackTrace();
        TestUtils.deleteAll(dataStore);
        Assert.fail(
            "Error occurred while testing a bounding box query of spatial index: '"
                + e.getLocalizedMessage() + "'");
    }

    // Clean up
    TestUtils.deleteAll(dataStore);
}
Example 16
Source File: RP_DBSCAN.java From RP-DBSCAN with Apache License 2.0 | 4 votes |
/**
 * Phase I : pre-processing for RP-DBSCAN.
 * Phase I-1 (Pseudo Random Partitioning) and Phase I-2 (Cell_Dictionary_Building & Broadcasting)
 */
public void phaseI()
{
    /**
     * Phase I-1. Pseudo Random Partitioning
     */

    //Read input data set from HDFS
    JavaRDD<String> lines = sc.textFile(Conf.inputPath, Conf.numOfPartitions);
    JavaPairRDD<List<Integer>, ApproximatedCell> dataMap = null;

    //Data partitioning
    if(Conf.boost)
    {
        dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
            .combineByKey(new Methods.CreateLocalApproximatedPoint(Conf.dim, Conf.epsilon, Conf.rho),
                new Methods.LocalApproximation(Conf.dim, Conf.epsilon, Conf.rho),
                new Methods.GlobalApproximation(Conf.dim))
            .mapToPair(new Methods.PseudoRandomPartition2(Conf.metaBlockWindow))
            .persist(StorageLevel.MEMORY_AND_DISK_SER());
    } else
        dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
            .groupByKey()
            .mapToPair(new Methods.PseudoRandomPartition(Conf.dim, Conf.epsilon, Conf.rho, Conf.metaBlockWindow, Conf.pairOutputPath))
            .persist(StorageLevel.MEMORY_AND_DISK_SER());

    numOfCells = dataMap.count();

    /**
     * Phase I-2. Cell_Dictionary_Building & Broadcasting
     */

    //Dictionary Defragmentation
    JavaPairRDD<List<Integer>, Long> ptsCountforEachMetaBlock = dataMap.mapToPair(new Methods.MetaBlockMergeWithApproximation()).reduceByKey(new Methods.AggregateCount());
    List<Tuple2<List<Integer>, Long>> numOfPtsInCell = ptsCountforEachMetaBlock.collect();
    //System.out.println("# of Blocks for virtually combining : " + numOfPtsInCell.size());

    HashMap<List<Integer>,List<Integer>> partitionIndex = new HashMap<List<Integer>,List<Integer>>();
    Tuple2<Long, List<Partition>> metaInfoForVirtualCombining = Methods.scalablePartition(numOfPtsInCell, Conf.dim, Conf.numOflvhCellsInMetaPartition/Conf.dim, partitionIndex);
    numOfSubCells = metaInfoForVirtualCombining._1;
    List<Partition> wholePartitions = metaInfoForVirtualCombining._2;
    numOfSubDictionaries = wholePartitions.size();

    //Build Two-Level Cell Dictionary composed of multiple sub-dictionaries
    JavaPairRDD<Integer, Iterable<ApproximatedCell>> evenlySplitPartitions = dataMap.flatMapToPair(new Methods.AssignApproximatedPointToPartition(partitionIndex)).groupByKey(wholePartitions.size());
    JavaPairRDD<Null, Null> metaDataSet = evenlySplitPartitions.mapToPair(new Methods.MetaGenerationWithApproximation(Conf.dim, Conf.epsilon, Conf.rho, Conf.minPts, conf, wholePartitions));
    metaDataSet.collect();

    //Re-partition the pseudo random partitions into Each Worker by a randomly assigned integer value for reducing the size of memory usage.
    dataset = dataMap.mapToPair(new Methods.Repartition(Conf.numOfPartitions)).repartition(Conf.numOfPartitions).persist(StorageLevel.MEMORY_AND_DISK_SER());

    //Broadcast two-level cell dictionary to every workers.
    try {
        metaPaths = FileIO.broadCastData(sc, conf, Conf.metaFoler);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example 17
Source File: SparkColumnCardinality.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String tableName = optionsHelper.getOptionValue(OPTION_TABLE_NAME);
    String output = optionsHelper.getOptionValue(OPTION_OUTPUT);
    int columnCnt = Integer.valueOf(optionsHelper.getOptionValue(OPTION_COLUMN_COUNT));

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Calculate table:" + tableName);
    //set spark.sql.catalogImplementation=hive, If it is not set, SparkSession can't read hive metadata, and throw "org.apache.spark.sql.AnalysisException"
    conf.set("spark.sql.catalogImplementation", "hive");
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(output));
        // table will be loaded by spark sql, so isSequenceFile set false
        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(false, sc, null, tableName);

        JavaPairRDD<Integer, Long> resultRdd = recordRDD.mapPartitionsToPair(new BuildHllCounter())
                .reduceByKey((x, y) -> {
                    x.merge(y);
                    return x;
                })
                .mapToPair(record -> {
                    return new Tuple2<>(record._1, record._2.getCountEstimate());
                })
                .sortByKey(true, 1)
                .cache();

        if (resultRdd.count() == 0) {
            ArrayList<Tuple2<Integer, Long>> list = new ArrayList<>();
            for (int i = 0; i < columnCnt; ++i) {
                list.add(new Tuple2<>(i, 0L));
            }
            JavaPairRDD<Integer, Long> nullRdd = sc.parallelizePairs(list).repartition(1);
            nullRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        } else {
            resultRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        }
    }
}
Example 18
Source File: ModelReplicaSplit.java From OpenDL with Apache License 2.0 | 2 votes |
/**
 * Split the input samples (one each split for one ModelReplica)
 *
 * @param input
 * @param nrModelReplica
 * @param cache
 * @return
 */
public JavaPairRDD<Integer, List<T>> split(JavaRDD<T> input, int nrModelReplica, SGDTrainConfig config) {
    JavaPairRDD<Integer, List<T>> output = input.map(new SplitModelReplica(nrModelReplica)).groupByKey().persist(config.getMrDataStorage());
    output.count();
    return output;
}