Java Code Examples for org.apache.spark.api.java.JavaPairRDD#count()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#count().
You can go to the original project or source file by following the links above each example.
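JavaPairRDD#count() is a Spark action: it returns the number of key-value pairs in the RDD as a long and triggers evaluation of the RDD's lineage, which is why several of the examples below call it purely to force materialization of a persisted RDD. Before the project examples, here is a minimal, self-contained sketch of the call; the class name and the local master setting are illustrative and not taken from any of the projects below.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class JavaPairRddCountExample {
    public static void main(String[] args) {
        // Local Spark context purely for illustration (hypothetical app name and master).
        SparkConf conf = new SparkConf().setAppName("JavaPairRddCountExample").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Build a small pair RDD of (word, length) tuples.
            JavaPairRDD<String, Integer> pairs = jsc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("spark", 5),
                    new Tuple2<>("count", 5),
                    new Tuple2<>("rdd", 3)));

            // count() is an action: it runs the job and returns the number of pairs.
            long n = pairs.count();
            System.out.println("Number of pairs: " + n); // prints 3
        }
    }
}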
Example 1
Source File: SparkTableChecker.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Check invalid indexes which do not refer to a base table row
 * @param table
 * @param index
 * @return
 * @throws StandardException
 */
private List<String> checkInvalidIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    List<String> messages = Lists.newLinkedList();
    SpliceSpark.pushScope(String.format("Check invalidate index from %s.%s", schemaName, indexName));
    PairDataSet<String, Tuple2<byte[], ExecRow>> d1 = index.subtractByKey(table, null);
    SpliceSpark.popScope();

    JavaPairRDD rdd = ((SparkPairDataSet) d1).rdd;
    invalidIndexCount = rdd.count();
    if (invalidIndexCount > 0) {
        if (fix) {
            return fixInvalidIndexes(rdd);
        } else {
            return reportInvalidIndexes(rdd);
        }
    }
    return messages;
}
Example 2
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 6 votes |
@SuppressWarnings("unchecked") public void cacheMatrixObject( String var ) { //get input rdd and default storage level MatrixObject mo = getMatrixObject(var); //double check size to avoid unnecessary spark context creation if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(mo.getDataCharacteristics())) ) return; JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo); //persist rdd (force rdd caching, if not already cached) if( !isRDDCached(in.id()) ) in.count(); //trigger caching to prevent contention }
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 6 votes |
@SuppressWarnings("unchecked") public void cacheMatrixObject( String var ) { //get input rdd and default storage level MatrixObject mo = getMatrixObject(var); //double check size to avoid unnecessary spark context creation if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(mo.getDataCharacteristics())) ) return; JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) getRDDHandleForMatrixObject(mo, FileFormat.BINARY); //persist rdd (force rdd caching, if not already cached) if( !isRDDCached(in.id()) ) in.count(); //trigger caching to prevent contention }
Example 4
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 6 votes |
@Test
public void test4() throws IOException {
    List<String> pdbIds = Arrays.asList("1BZ5"); //D5
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

    /*
     * C5-B4
     * C6-B3
     * D7-A2
     * D8-A1
     * E10-E9
     */
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(9, 10, false, true));

    long count = pdb.count();
    assertEquals(5, count);
}
Example 5
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 6 votes |
@Test
public void test3() throws IOException {
    List<String> pdbIds = Arrays.asList("4GIS"); //D4

    /*
     * A3-A2
     * A4-A1
     * B5-A1
     * B6-A2
     * B6-B5
     * B7-A3
     * B7-A4
     * B8-A4
     * B8-B7
     */
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));

    long count = pdb.count();
    assertEquals(9, count);
}
Example 6
Source File: JavaTC.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTC")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    Integer slices = (args.length > 0) ? Integer.parseInt(args[0]) : 2;
    JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
        new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<>(e._2(), e._1());
            }
        });

    long oldCount;
    long nextCount = tc.count();
    do {
        oldCount = nextCount;
        // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        // then project the result to obtain the new (x, z) paths.
        tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
        nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    spark.stop();
}
Example 7
Source File: SparkTableChecker.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Check base table whether all rows are indexed
 * @param table
 * @param index
 * @return
 * @throws StandardException
 */
private List<String> checkMissingIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    List<String> messages = Lists.newLinkedList();
    SpliceSpark.pushScope(String.format("Check unindexed rows from table %s.%s", schemaName, tableName));
    PairDataSet<String, ExecRow> d2 = table.subtractByKey(index, null);

    JavaPairRDD rdd = ((SparkPairDataSet)d2).rdd;
    missingIndexCount = rdd.count();
    if (missingIndexCount > 0) {
        messages = reportMissingIndexes(rdd, fix);
    }
    return messages;
}
Example 8
Source File: HoodieBloomIndex.java From hudi with Apache License 2.0 | 5 votes |
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {

  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
  }

  // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
  JavaPairRDD<String, String> partitionRecordKeyPairRDD =
      recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Lookup indexes for all the partition/recordkey pair
  JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (LOG.isDebugEnabled()) {
    long totalTaggedRecords = keyFilenamePairRDD.count();
    LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    keyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
Example 9
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 5 votes |
@Test
public void test2() throws IOException {
    List<String> pdbIds = Arrays.asList("5NV3"); // D4
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));

    long count = pdb.count();
    assertEquals(12, count);
}
Example 10
Source File: StructureToProteinDimersTest.java From mmtf-spark with Apache License 2.0 | 5 votes |
@Test
public void test1() throws IOException {
    List<String> pdbIds = Arrays.asList("1I1G"); // D4
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));

    long count = pdb.count();
    assertEquals(4, count);
}
Example 11
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);

    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 12
Source File: SparkCubingByLayer.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob
            .loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);

    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 13
Source File: SparkColumnCardinality.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String tableName = optionsHelper.getOptionValue(OPTION_TABLE_NAME);
    String output = optionsHelper.getOptionValue(OPTION_OUTPUT);
    int columnCnt = Integer.valueOf(optionsHelper.getOptionValue(OPTION_COLUMN_COUNT));

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Calculate table:" + tableName);
    //set spark.sql.catalogImplementation=hive, If it is not set, SparkSession can't read hive metadata, and throw "org.apache.spark.sql.AnalysisException"
    conf.set("spark.sql.catalogImplementation", "hive");
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(output));
        // table will be loaded by spark sql, so isSequenceFile set false
        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(false, sc, null, tableName);

        JavaPairRDD<Integer, Long> resultRdd = recordRDD.mapPartitionsToPair(new BuildHllCounter())
                .reduceByKey((x, y) -> {
                    x.merge(y);
                    return x;
                })
                .mapToPair(record -> {
                    return new Tuple2<>(record._1, record._2.getCountEstimate());
                })
                .sortByKey(true, 1)
                .cache();

        if (resultRdd.count() == 0) {
            ArrayList<Tuple2<Integer, Long>> list = new ArrayList<>();
            for (int i = 0; i < columnCnt; ++i) {
                list.add(new Tuple2<>(i, 0L));
            }
            JavaPairRDD<Integer, Long> nullRdd = sc.parallelizePairs(list).repartition(1);
            nullRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        } else {
            resultRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        }
    }
}
Example 14
Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

    // This avoids errors such as java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it was already assigned a level
    // Ideally, we should ensure that we donot redundantly call persist on the same RDD.
    StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();

    //cache right hand side because accessed many times
    in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
             .persist(pmapmmStorageLevel);

    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() )
    {
        //create broadcast for rdd partition
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
            .filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
            .mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));

        int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
        PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
        Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

        //matrix multiplication
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
            .flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
        rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
        rdd2.persist(pmapmmStorageLevel)
            .count();
        bpmb.unpersist(false);

        if( out == null )
            out = rdd2;
        else
            out = out.union(rdd2);
    }

    //cache final result
    out = out.persist(pmapmmStorageLevel);
    out.count();

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());

    //update output statistics if not inferred
    updateBinaryMMOutputDataCharacteristics(sec, true);
}
Example 15
Source File: GeoWaveSparkSQLIT.java From geowave with Apache License 2.0 | 4 votes |
@Test
public void testCreateDataFrame() throws Exception {
    // Set up Spark
    final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
    final SparkContext context = session.sparkContext();

    // ingest test points
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

    final SqlQueryRunner queryRunner = new SqlQueryRunner();
    queryRunner.setSparkSession(session);
    try {
        // Load RDD from datastore, no filters
        final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
        final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

        final long count = javaRdd.count();
        LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

        queryRunner.addInputStore(dataStore, null, "features");

        final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

        Dataset<Row> results = queryRunner.run();
        final long containsCount = results.count();
        LOGGER.warn("Got " + containsCount + " for GeomContains test");

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
        results = queryRunner.run();
        final long withinCount = results.count();
        LOGGER.warn("Got " + withinCount + " for GeomWithin test");

        Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

        // Test the output writer
        final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);
        sqlResultsWriter.writeResults("sqltest");

        queryRunner.removeAllStores();

        // Test other spatial UDFs
        final String line1 = "LINESTRING(0 0, 10 10)";
        final String line2 = "LINESTRING(0 10, 10 0)";
        queryRunner.setSql(
            "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");

        Row result = queryRunner.run().head();
        final boolean intersect = result.getBoolean(0);
        LOGGER.warn("GeomIntersects returned " + intersect);
        Assert.assertTrue("Lines should intersect", intersect);

        queryRunner.setSql(
            "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
        result = queryRunner.run().head();
        final boolean disjoint = result.getBoolean(0);
        LOGGER.warn("GeomDisjoint returned " + disjoint);
        Assert.assertFalse("Lines should not be disjoint", disjoint);
    } catch (final Exception e) {
        e.printStackTrace();
        TestUtils.deleteAll(dataStore);
        Assert.fail(
            "Error occurred while testing a bounding box query of spatial index: '"
                + e.getLocalizedMessage() + "'");
    }

    // Clean up
    TestUtils.deleteAll(dataStore);
}
Example 16
Source File: RP_DBSCAN.java From RP-DBSCAN with Apache License 2.0 | 4 votes |
/**
 * Phase I : pre-processing for RP-DBSCAN.
 * Phase I-1 (Pseudo Random Partitioning) and Phase I-2 (Cell_Dictionary_Building & Broadcasting)
 */
public void phaseI()
{
    /**
     * Phase I-1. Pseudo Random Partitioning
     */

    //Read input data set from HDFS
    JavaRDD<String> lines = sc.textFile(Conf.inputPath, Conf.numOfPartitions);
    JavaPairRDD<List<Integer>, ApproximatedCell> dataMap = null;

    //Data partitioning
    if(Conf.boost)
    {
        dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
            .combineByKey(new Methods.CreateLocalApproximatedPoint(Conf.dim, Conf.epsilon, Conf.rho),
                new Methods.LocalApproximation(Conf.dim, Conf.epsilon, Conf.rho),
                new Methods.GlobalApproximation(Conf.dim))
            .mapToPair(new Methods.PseudoRandomPartition2(Conf.metaBlockWindow))
            .persist(StorageLevel.MEMORY_AND_DISK_SER());
    } else
        dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
            .groupByKey()
            .mapToPair(new Methods.PseudoRandomPartition(Conf.dim, Conf.epsilon, Conf.rho, Conf.metaBlockWindow, Conf.pairOutputPath))
            .persist(StorageLevel.MEMORY_AND_DISK_SER());

    numOfCells = dataMap.count();

    /**
     * Phase I-2. Cell_Dictionary_Building & Broadcasting
     */

    //Dictionary Defragmentation
    JavaPairRDD<List<Integer>, Long> ptsCountforEachMetaBlock = dataMap.mapToPair(new Methods.MetaBlockMergeWithApproximation()).reduceByKey(new Methods.AggregateCount());
    List<Tuple2<List<Integer>, Long>> numOfPtsInCell = ptsCountforEachMetaBlock.collect();
    //System.out.println("# of Blocks for virtually combining : " + numOfPtsInCell.size());

    HashMap<List<Integer>,List<Integer>> partitionIndex = new HashMap<List<Integer>,List<Integer>>();
    Tuple2<Long, List<Partition>> metaInfoForVirtualCombining = Methods.scalablePartition(numOfPtsInCell, Conf.dim, Conf.numOflvhCellsInMetaPartition/Conf.dim, partitionIndex);
    numOfSubCells = metaInfoForVirtualCombining._1;
    List<Partition> wholePartitions = metaInfoForVirtualCombining._2;
    numOfSubDictionaries = wholePartitions.size();

    //Build Two-Level Cell Dictionary composed of multiple sub-dictionaries
    JavaPairRDD<Integer, Iterable<ApproximatedCell>> evenlySplitPartitions = dataMap.flatMapToPair(new Methods.AssignApproximatedPointToPartition(partitionIndex)).groupByKey(wholePartitions.size());
    JavaPairRDD<Null, Null> metaDataSet = evenlySplitPartitions.mapToPair(new Methods.MetaGenerationWithApproximation(Conf.dim, Conf.epsilon, Conf.rho, Conf.minPts, conf, wholePartitions));
    metaDataSet.collect();

    //Re-partition the pseudo random partitions into Each Worker by a randomly assigned integer value for reducing the size of memory usage.
    dataset = dataMap.mapToPair(new Methods.Repartition(Conf.numOfPartitions)).repartition(Conf.numOfPartitions).persist(StorageLevel.MEMORY_AND_DISK_SER());

    //Broadcast two-level cell dictionary to every workers.
    try {
        metaPaths = FileIO.broadCastData(sc, conf, Conf.metaFoler);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example 17
Source File: SparkColumnCardinality.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String tableName = optionsHelper.getOptionValue(OPTION_TABLE_NAME);
    String output = optionsHelper.getOptionValue(OPTION_OUTPUT);
    int columnCnt = Integer.valueOf(optionsHelper.getOptionValue(OPTION_COLUMN_COUNT));

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Calculate table:" + tableName);
    //set spark.sql.catalogImplementation=hive, If it is not set, SparkSession can't read hive metadata, and throw "org.apache.spark.sql.AnalysisException"
    conf.set("spark.sql.catalogImplementation", "hive");
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(output));
        // table will be loaded by spark sql, so isSequenceFile set false
        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(false, sc, null, tableName);

        JavaPairRDD<Integer, Long> resultRdd = recordRDD.mapPartitionsToPair(new BuildHllCounter())
                .reduceByKey((x, y) -> {
                    x.merge(y);
                    return x;
                })
                .mapToPair(record -> {
                    return new Tuple2<>(record._1, record._2.getCountEstimate());
                })
                .sortByKey(true, 1)
                .cache();

        if (resultRdd.count() == 0) {
            ArrayList<Tuple2<Integer, Long>> list = new ArrayList<>();
            for (int i = 0; i < columnCnt; ++i) {
                list.add(new Tuple2<>(i, 0L));
            }
            JavaPairRDD<Integer, Long> nullRdd = sc.parallelizePairs(list).repartition(1);
            nullRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        } else {
            resultRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        }
    }
}
Example 18
Source File: ModelReplicaSplit.java From OpenDL with Apache License 2.0 | 2 votes |
/**
 * Split the input samples (one each split for one ModelReplica)
 *
 * @param input
 * @param nrModelReplica
 * @param cache
 * @return
 */
public JavaPairRDD<Integer, List<T>> split(JavaRDD<T> input, int nrModelReplica, SGDTrainConfig config) {
    JavaPairRDD<Integer, List<T>> output = input.map(new SplitModelReplica(nrModelReplica)).groupByKey().persist(config.getMrDataStorage());
    output.count();
    return output;
}