org.apache.spark.storage.StorageLevel Java Examples
The following examples show how to use
org.apache.spark.storage.StorageLevel.
Each example is taken from an open-source project; the source file, project, and license are noted above the code.
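Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: pick a StorageLevel constant, pass it to persist(), trigger an action so the data is actually materialized, and unpersist when done. The class name, app name, and local master URL below are illustrative only and not taken from any of the projects listed.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

// Illustrative sketch only: persist an RDD with an explicit StorageLevel.
public class StorageLevelSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext("local[2]", "storage-level-sketch")) {
            JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

            // Keep serialized partitions in memory, spilling to disk if memory runs short.
            numbers.persist(StorageLevel.MEMORY_AND_DISK_SER());

            // The first action materializes (and caches) the RDD.
            System.out.println("count = " + numbers.count());

            // Release the cached blocks once they are no longer needed.
            numbers.unpersist();
        }
    }
}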
Example #1
Source File: GarmadonSparkStorageStatusListenerIntegrationTest.java From garmadon with Apache License 2.0 | 6 votes |
@Test
public void SparkStorageStatusListener_should_track_rdd_storage_status() throws InterruptedException {
    assertTrue(rdds.isEmpty());
    assertTrue(executors.isEmpty());

    //Memory
    JavaRDD rddMemory = makeRDD("MemRDD", StorageLevel.MEMORY_ONLY());
    rddMemory.collect();

    checkRddStorage(rddMemory.name(), equalTo(0L), greaterThan(0L), equalTo(0L));

    //Disk
    JavaRDD rddDisk = makeRDD("DiskRDD", StorageLevel.DISK_ONLY());
    rddDisk.collect();

    checkRddStorage(rddDisk.name(), equalTo(0L), equalTo(0L), greaterThan(0L));

    //OffHeap
    JavaRDD rddOffHeap = makeRDD("OffHeapRDD", StorageLevel.OFF_HEAP());
    rddOffHeap.collect();

    checkRddStorage(rddOffHeap.name(), greaterThan(0L), equalTo(0L), equalTo(0L));
}
Example #2
Source File: UnboundedDataset.java From beam with Apache License 2.0 | 6 votes |
@Override
@SuppressWarnings("unchecked")
public void cache(String storageLevel, Coder<?> coder) {
    // we "force" MEMORY storage level in streaming
    if (!StorageLevel.fromString(storageLevel).equals(StorageLevel.MEMORY_ONLY_SER())) {
        LOG.warn(
            "Provided StorageLevel: {} is ignored for streams, using the default level: {}",
            storageLevel,
            StorageLevel.MEMORY_ONLY_SER());
    }
    // Caching can cause Serialization, we need to code to bytes
    // more details in https://issues.apache.org/jira/browse/BEAM-2669
    Coder<WindowedValue<T>> wc = (Coder<WindowedValue<T>>) coder;
    this.dStream = dStream.map(CoderHelpers.toByteFunction(wc)).cache().map(CoderHelpers.fromByteFunction(wc));
}
Example #3
Source File: SparkStreamingPulsarReceiverTest.java From pulsar with Apache License 2.0 | 6 votes |
@Test(dataProvider = "ServiceUrls")
public void testDefaultSettingsOfReceiver(String serviceUrl) {
    ConsumerConfigurationData<byte[]> consConf = new ConsumerConfigurationData<>();

    Set<String> set = new HashSet<>();
    set.add(TOPIC);
    consConf.setTopicNames(set);
    consConf.setSubscriptionName(SUBS);

    SparkStreamingPulsarReceiver receiver = new SparkStreamingPulsarReceiver(
        serviceUrl,
        consConf,
        new AuthenticationDisabled());

    assertEquals(receiver.storageLevel(), StorageLevel.MEMORY_AND_DISK_2());
    assertNotNull(consConf.getMessageListener());
}
Example #4
Source File: PersistedOutputRDD.java From tinkerpop with Apache License 2.0 | 6 votes |
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
        LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
    SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));  // this might be bad cause it unpersists the job RDD
    // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
    final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
    if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
        graphRDD.mapValues(vertex -> {
            vertex.get().dropEdges(Direction.BOTH);
            return vertex;
        }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                // call action to eager store rdd
                .count();
    else
        graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                // call action to eager store rdd
                .count();
    Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}
Example #5
Source File: PathSeqBwaSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Writes RDD of reads to path. Note writeReads() is not used because there are separate paired/unpaired outputs.
 * Header sequence dictionary is reduced to only those that were aligned to.
 */
private void writeBam(final JavaRDD<GATKRead> reads, final String inputBamPath, final boolean isPaired,
                      final JavaSparkContext ctx, SAMFileHeader header) {

    //Only retain header sequences that were aligned to.
    //This invokes an action and therefore the reads must be cached.
    reads.persist(StorageLevel.MEMORY_AND_DISK_SER());
    header = PSBwaUtils.removeUnmappedHeaderSequences(header, reads, logger);

    final String outputPath = isPaired ? outputPaired : outputUnpaired;
    try {
        ReadsSparkSink.writeReads(ctx, outputPath, null, reads, header,
                shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
                PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()),
                shardedPartsDir, true, splittingIndexGranularity);
    } catch (final IOException e) {
        throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
    }
}
Example #6
Source File: StorageLevelSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
private static Map<StorageLevel, String> initMap() {
    Map<StorageLevel, String> map = new HashMap<>();
    map.put(StorageLevel.NONE(), "NONE");
    map.put(StorageLevel.DISK_ONLY(), "DISK_ONLY");
    map.put(StorageLevel.DISK_ONLY_2(), "DISK_ONLY_2");
    map.put(StorageLevel.MEMORY_ONLY(), "MEMORY_ONLY");
    map.put(StorageLevel.MEMORY_ONLY_2(), "MEMORY_ONLY_2");
    map.put(StorageLevel.MEMORY_ONLY_SER(), "MEMORY_ONLY_SER");
    map.put(StorageLevel.MEMORY_ONLY_SER_2(), "MEMORY_ONLY_SER_2");
    map.put(StorageLevel.MEMORY_AND_DISK(), "MEMORY_AND_DISK");
    map.put(StorageLevel.MEMORY_AND_DISK_2(), "MEMORY_AND_DISK_2");
    map.put(StorageLevel.MEMORY_AND_DISK_SER(), "MEMORY_AND_DISK_SER");
    map.put(StorageLevel.MEMORY_AND_DISK_SER_2(), "MEMORY_AND_DISK_SER_2");
    map.put(StorageLevel.OFF_HEAP(), "OFF_HEAP");
    return map;
}
Example #7
Source File: ALSUpdate.java From oryx with Apache License 2.0 | 6 votes |
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
        JavaPairRDD<String,float[]> javaRDD,
        Broadcast<? extends Map<String,Integer>> bIdToIndex) {

    RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
        new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
    ).mapValues(f -> {
        double[] d = new double[f.length];
        for (int i = 0; i < d.length; i++) {
            d[i] = f[i];
        }
        return d;
    }).rdd();

    // This mimics the persistence level established by ALS training methods
    scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());

    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
    return objKeyRDD;
}
Example #8
Source File: TestJsonYaml.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
public void testJsonYaml() {

    TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(2).batchSizePerWorker(32)
                    .exportDirectory("hdfs://SomeDirectory/").saveUpdater(false).averagingFrequency(3)
                    .storageLevel(StorageLevel.MEMORY_ONLY_SER_2()).storageLevelStreams(StorageLevel.DISK_ONLY())
                    .build();

    String json = tm.toJson();
    String yaml = tm.toYaml();

    // System.out.println(json);

    TrainingMaster fromJson = ParameterAveragingTrainingMaster.fromJson(json);
    TrainingMaster fromYaml = ParameterAveragingTrainingMaster.fromYaml(yaml);

    assertEquals(tm, fromJson);
    assertEquals(tm, fromYaml);
}
Example #9
Source File: PersistedInputOutputRDDIntegrateTest.java From tinkerpop with Apache License 2.0 | 5 votes |
@Test
public void shouldPersistRDDBasedOnStorageLevel() throws Exception {
    Spark.create("local[4]");
    int counter = 0;
    for (final String storageLevel : Arrays.asList("MEMORY_ONLY", "DISK_ONLY", "MEMORY_ONLY_SER", "MEMORY_AND_DISK_SER")) {
        assertEquals(counter, Spark.getRDDs().size());
        assertEquals(counter, Spark.getContext().getPersistentRDDs().size());
        counter++;
        final String rddName = TestHelper.makeTestDataDirectory(PersistedInputOutputRDDIntegrateTest.class, UUID.randomUUID().toString());
        final Configuration configuration = super.getBaseConfiguration();
        configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, SparkHadoopGraphProvider.PATHS.get("tinkerpop-modern-v3d0.kryo"));
        configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, GryoInputFormat.class.getCanonicalName());
        configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, PersistedOutputRDD.class.getCanonicalName());
        configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, storageLevel);
        configuration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, rddName);
        configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);
        Graph graph = GraphFactory.open(configuration);
        graph.compute(SparkGraphComputer.class)
                .result(GraphComputer.ResultGraph.NEW)
                .persist(GraphComputer.Persist.EDGES)
                .program(TraversalVertexProgram.build()
                        .traversal(graph.traversal().withComputer(SparkGraphComputer.class),
                                "gremlin-groovy",
                                "g.V().groupCount('m').by('name').out()").create(graph)).submit().get();
        ////////
        assertTrue(Spark.hasRDD(Constants.getGraphLocation(rddName)));
        assertEquals(StorageLevel.fromString(storageLevel), Spark.getRDD(Constants.getGraphLocation(rddName)).getStorageLevel());
        assertEquals(counter, Spark.getRDDs().size());
        assertEquals(counter, Spark.getContext().getPersistentRDDs().size());
    }
    Spark.close();
}
Example #10
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
@Override
public Dataset<T> persist(final StorageLevel newLevel) {
    final boolean userTriggered = initializeFunction(newLevel);
    final Dataset<T> result = from(super.persist(newLevel));
    this.setIsUserTriggered(userTriggered);
    return result;
}
Example #11
Source File: KafkaRangeReceiver.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
public KafkaRangeReceiver(
        KafkaConfig config,
        Set<Integer> partitionSet,
        StorageLevel storageLevel,
        KafkaMessageHandler<E> messageHandler) {
    super(storageLevel);
    this.kafkaConfig = config;
    _partitionSet = partitionSet;
    _messageHandler = messageHandler;
}
Example #12
Source File: Checkpoint.java From systemds with Apache License 2.0 | 5 votes |
/**
 * This is a utility method because Spark's StorageLevel.toString() is incompatible with its own
 * fromString() method.
 *
 * @param level RDD storage level
 * @return storage level as a string
 */
public static String getStorageLevelString( StorageLevel level) {
    if( StorageLevel.NONE().equals(level) )
        return "NONE";
    else if( StorageLevel.MEMORY_ONLY().equals(level) )
        return "MEMORY_ONLY";
    else if( StorageLevel.MEMORY_ONLY_2().equals(level) )
        return "MEMORY_ONLY_2";
    else if( StorageLevel.MEMORY_ONLY_SER().equals(level) )
        return "MEMORY_ONLY_SER";
    else if( StorageLevel.MEMORY_ONLY_SER_2().equals(level) )
        return "MEMORY_ONLY_SER_2";
    else if( StorageLevel.MEMORY_AND_DISK().equals(level) )
        return "MEMORY_AND_DISK";
    else if( StorageLevel.MEMORY_AND_DISK_2().equals(level) )
        return "MEMORY_AND_DISK_2";
    else if( StorageLevel.MEMORY_AND_DISK_SER().equals(level) )
        return "MEMORY_AND_DISK_SER";
    else if( StorageLevel.MEMORY_AND_DISK_SER_2().equals(level) )
        return "MEMORY_AND_DISK_SER_2";
    else if( StorageLevel.DISK_ONLY().equals(level) )
        return "DISK_ONLY";
    else if( StorageLevel.DISK_ONLY_2().equals(level) )
        return "DISK_ONLY_2";
    return "INVALID";
}
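As the comment above notes, the string produced by StorageLevel.toString() (a descriptive form along the lines of "StorageLevel(memory, deserialized, 1 replicas)", depending on the Spark version) is not accepted by StorageLevel.fromString(), which only recognizes the constant names such as "MEMORY_ONLY". A hypothetical round-trip sketch using this utility (the class name below is illustrative):

import org.apache.spark.storage.StorageLevel;

// Illustrative sketch: convert a StorageLevel to a name fromString() will accept, then back.
public class StorageLevelRoundTrip {
    public static void main(String[] args) {
        StorageLevel level = StorageLevel.MEMORY_AND_DISK_SER();

        // Checkpoint.getStorageLevelString(level) would return "MEMORY_AND_DISK_SER";
        // level.toString() would not be usable here.
        StorageLevel restored = StorageLevel.fromString("MEMORY_AND_DISK_SER");

        System.out.println(level.equals(restored)); // true
    }
}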
Example #13
Source File: CheckpointSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
public static CheckpointSPInstruction parseInstruction ( String str ) {
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    InstructionUtils.checkNumFields(parts, 3);

    String opcode = parts[0];
    CPOperand in = new CPOperand(parts[1]);
    CPOperand out = new CPOperand(parts[2]);
    StorageLevel level = StorageLevel.fromString(parts[3]);

    return new CheckpointSPInstruction(null, in, out, level, opcode, str);
}
Example #14
Source File: RP_DBSCAN.java From RP-DBSCAN with Apache License 2.0 | 5 votes |
/**
 * Phase II : local clustering for RP-DBSCAN.
 * Phase II-1 (Core Marking) and Phase II-2 (Subgraph Building)
 */
public void phaseII() {
    /**
     * Phase II-1: Core Marking
     */

    //Mark core cells and core points with the (eps,rho)-region query.
    JavaPairRDD<Long, ApproximatedCell> coreCells = dataset.mapPartitionsToPair(new Methods.FindCorePointsWithApproximation(Conf.dim, Conf.epsilon, Conf.minPts, conf, metaPaths)).persist(StorageLevel.MEMORY_AND_DISK_SER());

    //Count the number of core cells
    List<Tuple2<Integer, Long>> numOfCores = coreCells.mapToPair(new Methods.CountCorePts()).reduceByKey(new Methods.AggregateCount()).collect();
    numOfCorePoints = numOfCores.get(0)._2;

    //Broadcast core cell ids to every workers for updating the status of edges in cell subgraphs.
    try {
        corePaths = FileIO.broadCastData(sc, conf, Conf.coreInfoFolder);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    /**
     * Phase II-2: Subgraph Building
     */

    // Build cell subgraph
    edgeSet = coreCells.mapPartitionsToPair(new Methods.FindDirectDensityReachableEdgesWithApproximation(Conf.dim, Conf.epsilon, Conf.minPts, conf, metaPaths, corePaths, Conf.numOfPartitions)).repartition(Conf.numOfPartitions/2);
}
Example #15
Source File: PSPairedUnpairedSplitterSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Gets RDDs of the paired and unpaired reads. Option to cache the repartitioned RDD.
 */
public PSPairedUnpairedSplitterSpark(final JavaRDD<GATKRead> reads,
                                     final int readsPerPartitionGuess,
                                     final boolean cacheReads) {

    //Repartition reads then map each partition to a pair of lists, one containing the paired reads and the
    // other the unpaired reads
    repartitionedReads = PSFilter.repartitionReadsByName(reads)
            .mapPartitions(iter -> mapPartitionsToPairedAndUnpairedLists(iter, readsPerPartitionGuess));

    shouldBeCached = cacheReads;
    if (cacheReads) {
        repartitionedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());
        isCached = true;
    }
}
Example #16
Source File: Dataset.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public Dataset<T> persist(final StorageLevel newLevel) {
    final boolean userTriggered = initializeFunction(newLevel);
    final Dataset<T> result = from(super.persist(newLevel));
    this.setIsUserTriggered(userTriggered);
    return result;
}
Example #17
Source File: SparkContextStorageCheck.java From tinkerpop with Apache License 2.0 | 5 votes |
@Test
public void shouldSupportDirectoryFileDistinction() throws Exception {
    final Storage storage = SparkContextStorage.open(graph.configuration());
    for (int i = 0; i < 10; i++) {
        JavaSparkContext.fromSparkContext(Spark.getContext()).emptyRDD().setName("directory1/file1-" + i + ".txt.bz").persist(StorageLevel.DISK_ONLY());
    }
    for (int i = 0; i < 5; i++) {
        JavaSparkContext.fromSparkContext(Spark.getContext()).emptyRDD().setName("directory2/file2-" + i + ".txt.bz").persist(StorageLevel.DISK_ONLY());
    }
    super.checkFileDirectoryDistinction(storage, "directory1", "directory2");
}
Example #18
Source File: GlobalWatermarkHolder.java From beam with Apache License 2.0 | 5 votes |
private static Map<Integer, SparkWatermarks> initWatermarks(final BlockManager blockManager) {

    final Map<Integer, SparkWatermarks> watermarks = fetchSparkWatermarks(blockManager);

    if (watermarks == null) {
        final HashMap<Integer, SparkWatermarks> empty = Maps.newHashMap();
        blockManager.putSingle(
            WATERMARKS_BLOCK_ID, empty, StorageLevel.MEMORY_ONLY(), true, WATERMARKS_TAG);
        return empty;
    } else {
        return watermarks;
    }
}
Example #19
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 5 votes |
public ParameterAveragingTrainingMaster(boolean saveUpdater, Integer numWorkers, int rddDataSetNumExamples,
                int batchSizePerWorker, int averagingFrequency, int aggregationDepth, int prefetchNumBatches,
                Repartition repartition, RepartitionStrategy repartitionStrategy, StorageLevel storageLevel,
                boolean collectTrainingStats) {
    checkArgument(numWorkers > 0, "Invalid number of workers: " + numWorkers + " (must be >= 1)");
    checkArgument(rddDataSetNumExamples > 0, "Invalid rdd data set size: " + rddDataSetNumExamples + " (must be >= 1)");
    checkArgument(averagingFrequency > 0, "Invalid input: averaging frequency must be >= 1");
    checkArgument(aggregationDepth > 0, "Invalid input: tree aggregation channels must be >= 1");

    this.saveUpdater = saveUpdater;
    this.numWorkers = numWorkers;
    this.rddDataSetNumExamples = rddDataSetNumExamples;
    this.batchSizePerWorker = batchSizePerWorker;
    this.averagingFrequency = averagingFrequency;
    this.aggregationDepth = aggregationDepth;
    this.prefetchNumBatches = prefetchNumBatches;
    this.collectTrainingStats = collectTrainingStats;
    this.repartition = repartition;
    this.repartitionStrategy = repartitionStrategy;
    this.storageLevel = storageLevel;
    if (collectTrainingStats)
        stats = new ParameterAveragingTrainingMasterStats.ParameterAveragingTrainingMasterStatsHelper();

    String jvmuid = UIDProvider.getJVMUID();
    this.trainingMasterUID =
            System.currentTimeMillis() + "_" + (jvmuid.length() <= 8 ? jvmuid : jvmuid.substring(0, 8));
    this.rng = new Random();
}
Example #20
Source File: HoodieBloomIndex.java From hudi with Apache License 2.0 | 5 votes |
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
        HoodieTable<T> hoodieTable) {

    // Step 0: cache the input record RDD
    if (config.getBloomIndexUseCaching()) {
        recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
    }

    // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
    JavaPairRDD<String, String> partitionRecordKeyPairRDD =
            recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

    // Lookup indexes for all the partition/recordkey pair
    JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
            lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

    // Cache the result, for subsequent stages.
    if (config.getBloomIndexUseCaching()) {
        keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
    }
    if (LOG.isDebugEnabled()) {
        long totalTaggedRecords = keyFilenamePairRDD.count();
        LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
    }

    // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
    // Cost: 4 sec.
    JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

    if (config.getBloomIndexUseCaching()) {
        recordRDD.unpersist(); // unpersist the input Record RDD
        keyFilenamePairRDD.unpersist();
    }

    return taggedRecordRDD;
}
Example #21
Source File: BaseCommitActionExecutor.java From hudi with Apache License 2.0 | 5 votes |
public HoodieWriteMetadata execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
    HoodieWriteMetadata result = new HoodieWriteMetadata();

    // Cache the tagged records, so we don't end up computing both
    // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
    if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) {
        inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
    } else {
        LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
    }

    WorkloadProfile profile = null;
    if (isWorkloadProfileNeeded()) {
        profile = new WorkloadProfile(inputRecordsRDD);
        LOG.info("Workload profile :" + profile);
        saveWorkloadProfileMetadataToInflight(profile, instantTime);
    }

    // partition using the insert partitioner
    final Partitioner partitioner = getPartitioner(profile);
    JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDD, partitioner);
    JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
        if (WriteOperationType.isChangingRecords(operationType)) {
            return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
        } else {
            return handleInsertPartition(instantTime, partition, recordItr, partitioner);
        }
    }, true).flatMap(List::iterator);

    updateIndexAndCommitIfNeeded(writeStatusRDD, result);
    return result;
}
Example #22
Source File: StorageLevelSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Override
public void serialize(StorageLevel storageLevel, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
                throws IOException, JsonProcessingException {
    //This is a little ugly, but Spark doesn't provide many options here...
    String s = null;
    if (storageLevel != null) {
        s = map.get(storageLevel);
    }
    jsonGenerator.writeString(s);
}
Example #23
Source File: DataStep.java From envelope with Apache License 2.0 | 5 votes |
public boolean isCached() {
    if (data == null) {
        return false;
    }

    return data.storageLevel() != StorageLevel.NONE();
}
Example #24
Source File: SparkStreamingPulsarReceiver.java From pulsar with Apache License 2.0 | 5 votes |
public SparkStreamingPulsarReceiver(StorageLevel storageLevel,
        String serviceUrl,
        ConsumerConfigurationData<byte[]> conf,
        Authentication authentication) {
    super(storageLevel);

    checkNotNull(serviceUrl, "serviceUrl must not be null");
    checkNotNull(conf, "ConsumerConfigurationData must not be null");
    checkArgument(conf.getTopicNames().size() > 0, "TopicNames must be set a value.");
    checkNotNull(conf.getSubscriptionName(), "SubscriptionName must not be null");

    this.serviceUrl = serviceUrl;
    this.authentication = authentication;

    if (conf.getMessageListener() == null) {
        conf.setMessageListener((MessageListener<byte[]> & Serializable) (consumer, msg) -> {
            try {
                store(msg.getData());
                consumer.acknowledgeAsync(msg);
            } catch (Exception e) {
                LOG.error("Failed to store a message : {}", e.getMessage());
                consumer.negativeAcknowledge(msg);
            }
        });
    }
    this.conf = conf;
}
Example #25
Source File: TieredSpatialJoin.java From geowave with Apache License 2.0 | 5 votes |
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
        final Broadcast<GeomFunction> geomPredicate,
        final int highestPartitionCount,
        final HashPartitioner partitioner) {
    // Cogroup groups on same tier ByteArrayId and pairs them into Iterable
    // sets.
    JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
            leftTier.cogroup(rightTier, partitioner);

    // Filter only the pairs that have data on both sides, bucket strategy
    // should have been accounted for by this point.
    // We need to go through the pairs and test each feature against each
    // other
    // End with a combined RDD for that tier.
    joinedTiers = joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

    final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches =
            joinedTiers.flatMapValues(
                (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
                    final GeomFunction predicate = geomPredicate.value();
                    final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
                    for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
                        for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                            if (predicate.call(leftTuple._2, rightTuple._2)) {
                                results.add(leftTuple._1);
                                results.add(rightTuple._1);
                            }
                        }
                    }
                    return results;
                }).mapToPair(Tuple2::swap).reduceByKey(partitioner, (id1, id2) -> id1).persist(
                    StorageLevel.MEMORY_ONLY_SER());

    return finalMatches;
}
Example #26
Source File: StringListReceiver.java From cxf with Apache License 2.0 | 4 votes |
public StringListReceiver(List<String> inputStrings) {
    super(StorageLevel.MEMORY_ONLY());
    this.inputStrings = inputStrings;
}
Example #27
Source File: Throughput.java From flink-perf with Apache License 2.0 | 4 votes |
public Source(StorageLevel storageLevel) {
    super(storageLevel);
    payload = new byte[12];
}
Example #28
Source File: SampleConsumer.java From kafka-spark-consumer with Apache License 2.0 | 4 votes |
@SuppressWarnings("deprecation")
private void run() {

    Properties props = new Properties();
    props.put("zookeeper.hosts", "zkhost");
    props.put("zookeeper.port", "2181");
    props.put("kafka.topic", "topicA,topicB,topicC");
    props.put("kafka.consumer.id", "kafka-consumer");
    // Optional Properties
    props.put("zookeeper.broker.path", "/brokers");
    props.put("zookeeper.consumer.path", "/consumers");
    props.put("consumer.forcefromstart", "false");
    props.put("max.poll.records", "10");
    props.put("consumer.fillfreqms", "500");
    props.put("consumer.backpressure.enabled", "true");
    //Kafka properties
    props.put("bootstrap.servers", "kafkahost-1:6667," +
            "kafkahost-2:6667," +
            "kafkahost-3:6667," +
            "kafkahost-4:6667");
    props.put("security.protocol", "SSL");
    props.put("ssl.truststore.location", "~/kafka-securitykafka.server.truststore.jks");
    props.put("ssl.truststore.password", "test1234");

    SparkConf _sparkConf = new SparkConf();
    JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
    // Specify number of Receivers you need.
    int numberOfReceivers = 6;

    JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
            jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

    unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
        @Override
        public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
            //Start Application Logic
            rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
                @Override
                public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
                    int countTopicA = 0;
                    int countTopicB = 0;
                    int countTopicC = 0;
                    while (mmItr.hasNext()) {
                        MessageAndMetadata<byte[]> mm = mmItr.next();
                        if (mm.getTopic().equals("topicA")) {
                            countTopicA++;
                        } else if (mm.getTopic().equals("topicB")) {
                            countTopicB++;
                        } else if (mm.getTopic().equals("topicC")) {
                            countTopicC++;
                        }
                    }
                    System.out.println("topicA count " + countTopicA);
                    System.out.println("topicB count " + countTopicB);
                    System.out.println("topicC count " + countTopicC);
                }
            });
            System.out.println("RDD count " + rdd.count());
            //End Application Logic
            //commit offset
            System.out.println("Commiting Offset");
            ProcessedOffsetManager.persistsPartition(rdd, props);
        }
    });

    try {
        jsc.start();
        jsc.awaitTermination();
    } catch (Exception ex) {
        jsc.ssc().sc().cancelAllJobs();
        jsc.stop(true, false);
        System.exit(-1);
    }
}
Example #29
Source File: ReceiverLauncher.java From kafka-spark-consumer with Apache License 2.0 | 4 votes |
public static DStream<MessageAndMetadata<byte[]>> launch(
        StreamingContext ssc, Properties pros, int numberOfReceivers, StorageLevel storageLevel) {
    JavaStreamingContext jsc = new JavaStreamingContext(ssc);
    return createStream(jsc, pros, numberOfReceivers, storageLevel, new IdentityMessageHandler()).dstream();
}