org.apache.spark.SparkContext Java Examples
The following examples show how to use
org.apache.spark.SparkContext.
The source file, project of origin, and license for each example are noted in its header.
Example #1
Source File: JobUtil.java From spark-llap with Apache License 2.0 | 7 votes |
public static JobConf createJobConf(Map<String, String> options, String queryString) {
  JobConf jobConf = new JobConf(SparkContext.getOrCreate().hadoopConfiguration());
  jobConf.set("hive.llap.zk.registry.user", "hive");
  jobConf.set("llap.if.hs2.connection", HWConf.RESOLVED_HS2_URL.getFromOptionsMap(options));
  if (queryString != null) {
    jobConf.set("llap.if.query", queryString);
  }
  jobConf.set("llap.if.user", HWConf.USER.getFromOptionsMap(options));
  jobConf.set("llap.if.pwd", HWConf.PASSWORD.getFromOptionsMap(options));
  if (options.containsKey("default.db")) {
    jobConf.set("llap.if.database", HWConf.DEFAULT_DB.getFromOptionsMap(options));
  }
  if (!options.containsKey("handleid")) {
    String handleId = UUID.randomUUID().toString();
    options.put("handleid", handleId);
  }
  jobConf.set("llap.if.handleid", options.get("handleid"));
  return jobConf;
}
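The example above seeds its JobConf from the active context's Hadoop configuration. A minimal standalone sketch of that same pattern, using only core Spark and Hadoop APIs (the class name and the mapreduce.job.name setting are illustrative, not taken from spark-llap):

import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class HadoopConfFromSparkContext {
  public static void main(String[] args) {
    // Reuse (or create) the application's SparkContext.
    SparkConf conf = new SparkConf().setAppName("jobconf-demo").setMaster("local[*]");
    SparkContext sc = SparkContext.getOrCreate(conf);

    // Seed a JobConf from the context's Hadoop configuration, then layer
    // job-specific settings on top, as the LLAP example does.
    JobConf jobConf = new JobConf(sc.hadoopConfiguration());
    jobConf.set("mapreduce.job.name", "derived-from-spark-context"); // illustrative setting

    System.out.println("fs.defaultFS = " + jobConf.get("fs.defaultFS"));
    sc.stop();
  }
}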
Example #2
Source File: BigQuerySparkSQL.java From spark-on-k8s-gcp-examples with Apache License 2.0 | 6 votes |
private static BigQuerySQLContext createBigQuerySQLContext(String[] args) {
  String projectId = args[0];
  Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
      "GCP project ID must not be empty");
  String gcsBucket = args[1];
  Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
      "GCS bucket must not be empty");
  String serviceAccountJsonKeyFilePath = System.getenv(APPLICATION_CREDENTIALS_ENV);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(serviceAccountJsonKeyFilePath),
      APPLICATION_CREDENTIALS_ENV + " must be set");

  SQLContext sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
  BigQuerySQLContext bigQuerySQLContext = new BigQuerySQLContext(sqlContext);
  bigQuerySQLContext.setBigQueryProjectId(projectId);
  bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
  bigQuerySQLContext.setGcpJsonKeyFile(serviceAccountJsonKeyFilePath);
  return bigQuerySQLContext;
}
Example #3
Source File: DataStreamLoaderExample.java From toolbox with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
  SparkConf conf = new SparkConf().setAppName("SLink!").setMaster("local");
  SparkContext sc = new SparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);

  // Path to dataset
  String path = "datasets/simulated/WI_samples.json";

  // Create an AMIDST object for managing the data
  DataSpark dataSpark = DataSparkLoader.open(sqlContext, path);

  // Print all the instances in the dataset
  dataSpark.collectDataStream()
      .forEach(dataInstance -> System.out.println(dataInstance));
}
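SQLContext is the legacy entry point used above; on current Spark versions the equivalent setup is usually done through SparkSession, which still exposes the underlying SparkContext. A minimal sketch assuming the same local master (the AMIDST-specific classes are omitted):

import org.apache.spark.SparkContext;
import org.apache.spark.sql.SparkSession;

public class SparkSessionSetup {
  public static void main(String[] args) {
    // Builder-based entry point; sparkContext() exposes the underlying SparkContext.
    SparkSession spark = SparkSession.builder()
        .appName("SLink!")
        .master("local")
        .getOrCreate();

    SparkContext sc = spark.sparkContext();
    System.out.println("Spark version: " + sc.version());

    spark.stop();
  }
}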
Example #4
Source File: SparkMaster.java From GeoTriples with Apache License 2.0 | 6 votes |
/**
 * Convert the input Dataset into RDF triples and store the results.
 * The conversion takes place per partition using the mapPartitions Spark transformation.
 * @param mapping_list list of TripleMaps
 */
private void convert_partition(ArrayList<TriplesMap> mapping_list) {
  SparkContext sc = SparkContext.getOrCreate();

  Pair<ArrayList<TriplesMap>, List<String>> transformation_info =
      new Pair<>(mapping_list, Arrays.asList(reader.getHeaders()));
  ClassTag<Pair<ArrayList<TriplesMap>, List<String>>> classTag_pair =
      scala.reflect.ClassTag$.MODULE$.apply(Pair.class);
  Broadcast<Pair<ArrayList<TriplesMap>, List<String>>> bd_info =
      sc.broadcast(transformation_info, classTag_pair);

  rowRDD
      .mapPartitions((Iterator<Row> rows_iter) -> {
        ArrayList<TriplesMap> p_mapping_list = bd_info.value().getKey();
        List<String> p_header = bd_info.value().getValue();
        RML_Converter rml_converter = new RML_Converter(p_mapping_list, p_header);
        rml_converter.start();
        rml_converter.registerFunctions();
        Iterator<String> triples = rml_converter.convertPartition(rows_iter);
        rml_converter.stop();
        return triples;
      })
      .saveAsTextFile(outputDir);
}
Example #5
Source File: LensAPI.java From cognition with Apache License 2.0 | 6 votes |
/**
 * Helper method for creating the spark context from the given cognition configuration
 * @return a new configured spark context
 */
public SparkContext createSparkContext() {
  SparkConf conf = new SparkConf();
  Configuration config = cognition.getProperties();
  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.setAppName(config.getString("app.name"));
  conf.setMaster(config.getString("master"));

  Iterator<String> iterator = config.getKeys("spark");
  while (iterator.hasNext()) {
    String key = iterator.next();
    conf.set(key, config.getString(key));
  }

  SparkContext sc = new SparkContext(conf);
  for (String jar : config.getStringArray("jars")) {
    sc.addJar(jar);
  }
  return sc;
}
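A similar configuration-driven setup can be sketched with a plain Java map standing in for the cognition configuration (the map contents and class name below are hypothetical):

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class ConfigDrivenContext {
  public static void main(String[] args) {
    // Hypothetical externalized settings standing in for the cognition configuration.
    Map<String, String> settings = new HashMap<>();
    settings.put("spark.executor.memory", "1g");
    settings.put("spark.ui.enabled", "false");

    SparkConf conf = new SparkConf()
        .setAppName("config-driven-app")
        .setMaster("local[2]");
    // Copy every externalized key into the SparkConf, mirroring the while loop above.
    settings.forEach(conf::set);

    SparkContext sc = new SparkContext(conf);
    System.out.println("spark.executor.memory = " + sc.getConf().get("spark.executor.memory"));
    sc.stop();
  }
}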
Example #6
Source File: GeoWaveRDDLoader.java From geowave with Apache License 2.0 | 6 votes |
public static GeoWaveIndexedRDD loadIndexedRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final RDDOptions rddOpts,
    final NumericIndexStrategy indexStrategy) throws IOException {
  final GeoWaveRDD wrappedRDD = GeoWaveRDDLoader.loadRDD(sc, storeOptions, rddOpts);
  // Index strategy can be expensive so we will broadcast it and store it
  Broadcast<NumericIndexStrategy> broadcastStrategy = null;
  if (indexStrategy != null) {
    broadcastStrategy =
        (Broadcast<NumericIndexStrategy>) RDDUtils.broadcastIndexStrategy(sc, indexStrategy);
  }
  final GeoWaveIndexedRDD returnRDD = new GeoWaveIndexedRDD(wrappedRDD, broadcastStrategy);
  return returnRDD;
}
Example #7
Source File: SourceRDD.java From beam with Apache License 2.0 | 6 votes |
public Bounded(
    SparkContext sc,
    BoundedSource<T> source,
    SerializablePipelineOptions options,
    String stepName) {
  super(sc, NIL, JavaSparkContext$.MODULE$.fakeClassTag());
  this.source = source;
  this.options = options;
  // the input parallelism is determined by Spark's scheduler backend.
  // when running on YARN/SparkDeploy it's the result of max(totalCores, 2).
  // when running on Mesos it's 8.
  // when running local it's the total number of cores (local = 1, local[N] = N,
  // local[*] = estimation of the machine's cores).
  // ** the configuration "spark.default.parallelism" takes precedence over all of the above **
  this.numPartitions = sc.defaultParallelism();
  checkArgument(this.numPartitions > 0, "Number of partitions must be greater than zero.");
  this.bundleSize = options.get().as(SparkPipelineOptions.class).getBundleSize();
  this.stepName = stepName;
  this.metricsAccum = MetricsAccumulator.getInstance();
}
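The comment block above describes how sc.defaultParallelism() is resolved. A small sketch illustrating the spark.default.parallelism override in local mode (the class name and values are illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class DefaultParallelismDemo {
  public static void main(String[] args) {
    // In local[4] mode defaultParallelism() would be 4, but the explicit
    // spark.default.parallelism setting takes precedence, as the comment above notes.
    SparkConf conf = new SparkConf()
        .setAppName("parallelism-demo")
        .setMaster("local[4]")
        .set("spark.default.parallelism", "8");

    SparkContext sc = new SparkContext(conf);
    System.out.println("defaultParallelism = " + sc.defaultParallelism()); // expected: 8
    sc.stop();
  }
}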
Example #8
Source File: SparkTextFileBoundedSourceVertex.java From incubator-nemo with Apache License 2.0 | 6 votes |
/**
 * Constructor.
 *
 * @param sparkContext  the spark context.
 * @param inputPath     the path of the target text file.
 * @param numPartitions the number of partitions.
 */
public SparkTextFileBoundedSourceVertex(final SparkContext sparkContext,
                                        final String inputPath,
                                        final int numPartitions) {
  this.readables = new ArrayList<>();
  final Partition[] partitions = sparkContext.textFile(inputPath, numPartitions).getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkTextFileBoundedSourceReadable(
        partitions[i],
        sparkContext.getConf(),
        i,
        inputPath,
        numPartitions));
  }
  this.estimatedSizeBytes = SizeEstimator.estimate(sparkContext.textFile(inputPath, numPartitions));
}
Example #9
Source File: MizoRDD.java From mizo with Apache License 2.0 | 5 votes |
public MizoRDD(SparkContext context, IMizoRDDConfig config, ClassTag<TReturn> classTag) {
  super(context, new ArrayBuffer<>(), classTag);

  if (!Strings.isNullOrEmpty(config.logConfigPath())) {
    PropertyConfigurator.configure(config.logConfigPath());
  }

  this.config = config;
  this.regionsPaths = getRegionsPaths(config.regionDirectoriesPath());
  this.relationTypes = loadRelationTypes(config.titanConfigPath());
}
Example #10
Source File: ConverterFactory.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 5 votes |
static public void checkVersion() {
  SparkContext sparkContext = SparkContext.getOrCreate();

  int[] version = parseVersion(sparkContext.version());
  if (!Arrays.equals(ConverterFactory.VERSION, version)) {
    throw new IllegalArgumentException(
        "Expected Apache Spark ML version " + formatVersion(ConverterFactory.VERSION)
            + ", got version " + formatVersion(version) + " (" + sparkContext.version() + ")");
  }
}
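A simplified sketch of the same version-guard idea, parsing SparkContext.version() with plain Java (parseVersion/formatVersion from jpmml-sparkml are replaced by inline code, and the threshold is illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class SparkVersionCheck {
  public static void main(String[] args) {
    SparkContext sc = SparkContext.getOrCreate(
        new SparkConf().setAppName("version-check").setMaster("local"));

    // version() returns a string such as "2.4.8"; split out the numeric components.
    String[] parts = sc.version().split("\\.");
    int major = Integer.parseInt(parts[0]);
    int minor = Integer.parseInt(parts[1]);
    System.out.println("Running Spark " + major + "." + minor);

    if (major < 2) { // illustrative threshold
      throw new IllegalStateException("Expected at least Spark 2.x, got " + sc.version());
    }
    sc.stop();
  }
}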
Example #11
Source File: JobFactoryTest.java From rdf2x with Apache License 2.0 | 5 votes |
@Test
public void testGetPersistJob() throws ConfigurationException {
  Runnable job = JobFactory.getJob(new String[]{
      "convert",
      "--input.file", "test.nq",
      "--output.target", "Preview"});

  // stop the created Spark Context to avoid conflicts in other tests
  SparkContext.getOrCreate().stop();

  assertNotNull("Non-null write job returned from factory", job);
  assertEquals("Correct job returned from factory", ConvertJob.class, job.getClass());
}
Example #12
Source File: Spark.java From tinkerpop with Apache License 2.0 | 5 votes |
public static SparkContext recreateStopped() {
  if (null == CONTEXT)
    throw new IllegalStateException("The Spark context has not been created.");
  if (!CONTEXT.isStopped())
    throw new IllegalStateException("The Spark context is not stopped.");

  CONTEXT = SparkContext.getOrCreate(CONTEXT.getConf());
  return CONTEXT;
}
Example #13
Source File: CassandraDependenciesJob.java From zipkin-dependencies with Apache License 2.0 | 5 votes |
public void run() {
  long microsLower = day * 1000;
  long microsUpper = (day * 1000) + TimeUnit.DAYS.toMicros(1) - 1;

  log.info("Running Dependencies job for {}: {} ≤ Span.timestamp {}",
      dateStamp, microsLower, microsUpper);

  SparkContext sc = new SparkContext(conf);

  List<DependencyLink> links = javaFunctions(sc)
      .cassandraTable(keyspace, "traces")
      .spanBy(r -> r.getLong("trace_id"), Long.class)
      .flatMapValues(new CassandraRowsToDependencyLinks(logInitializer, microsLower, microsUpper))
      .values()
      .mapToPair(l -> Tuple2.apply(Tuple2.apply(l.parent(), l.child()), l))
      .reduceByKey((l, r) -> DependencyLink.newBuilder()
          .parent(l.parent())
          .child(l.child())
          .callCount(l.callCount() + r.callCount())
          .errorCount(l.errorCount() + r.errorCount())
          .build())
      .values()
      .collect();
  sc.stop();

  saveToCassandra(links);
}
Example #14
Source File: Spark.java From tinkerpop with Apache License 2.0 | 5 votes |
public static SparkContext create(final SparkConf sparkConf) {
  if (isContextNullOrStopped()) {
    sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
    CONTEXT = SparkContext.getOrCreate(sparkConf);
  }
  return CONTEXT;
}
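Both TinkerPop snippets revolve around the getOrCreate/stop lifecycle of a shared context. A self-contained sketch of that lifecycle (the class name is illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class ContextLifecycle {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("lifecycle-demo").setMaster("local");

    // getOrCreate either returns the running context or builds a new one from conf.
    SparkContext first = SparkContext.getOrCreate(conf);
    SparkContext second = SparkContext.getOrCreate(conf);
    System.out.println("Same instance: " + (first == second)); // true

    first.stop();
    System.out.println("Stopped: " + first.isStopped()); // true

    // After a stop, getOrCreate builds a fresh context, mirroring recreateStopped() above.
    SparkContext recreated = SparkContext.getOrCreate(conf);
    System.out.println("Recreated is stopped: " + recreated.isStopped()); // false
    recreated.stop();
  }
}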
Example #15
Source File: SparkJavaRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Static method to create a SparkJavaRDD object from a text file.
 *
 * @param sparkContext  the spark context containing configurations.
 * @param minPartitions the minimum number of partitions.
 * @param inputPath     the path of the input text file.
 * @return the new SparkJavaRDD object
 */
public static SparkJavaRDD<String> of(final SparkContext sparkContext,
                                      final int minPartitions,
                                      final String inputPath) {
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();

  final org.apache.spark.rdd.RDD<String> textRdd = sparkContext.textFile(inputPath, minPartitions);
  final int numPartitions = textRdd.getNumPartitions();
  final IRVertex textSourceVertex =
      new SparkTextFileBoundedSourceVertex(sparkContext, inputPath, numPartitions);
  textSourceVertex.setProperty(ParallelismProperty.of(numPartitions));
  builder.addVertex(textSourceVertex);

  return new SparkJavaRDD<>(textRdd, sparkContext, builder.buildWithoutSourceSinkCheck(), textSourceVertex);
}
Example #16
Source File: PrecisionMetric.java From predictionio-template-java-ecom-recommender with Apache License 2.0 | 5 votes |
@Override
public Double calculate(SparkContext sc,
                        Seq<Tuple2<EmptyParams, RDD<Tuple3<Query, PredictedResult, Set<String>>>>> qpas) {
  List<Tuple2<EmptyParams, RDD<Tuple3<Query, PredictedResult, Set<String>>>>> sets =
      JavaConversions.seqAsJavaList(qpas);
  List<Double> allSetResults = new ArrayList<>();

  for (Tuple2<EmptyParams, RDD<Tuple3<Query, PredictedResult, Set<String>>>> set : sets) {
    List<Double> setResults = set._2().toJavaRDD().map(
        new Function<Tuple3<Query, PredictedResult, Set<String>>, Double>() {
          @Override
          public Double call(Tuple3<Query, PredictedResult, Set<String>> qpa) throws Exception {
            Set<String> predicted = new HashSet<>();
            for (ItemScore itemScore : qpa._2().getItemScores()) {
              predicted.add(itemScore.getItemEntityId());
            }
            Set<String> intersection = new HashSet<>(predicted);
            intersection.retainAll(qpa._3());

            return 1.0 * intersection.size() / qpa._2().getItemScores().size();
          }
        }).collect();

    allSetResults.addAll(setResults);
  }

  double sum = 0.0;
  for (Double value : allSetResults) sum += value;

  return sum / allSetResults.size();
}
Example #17
Source File: Model.java From predictionio-template-java-ecom-recommender with Apache License 2.0 | 5 votes |
public static Model load(String id, Params params, SparkContext sc) {
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
  JavaPairRDD<Integer, double[]> userFeatures = JavaPairRDD.<Integer, double[]>fromJavaRDD(
      jsc.<Tuple2<Integer, double[]>>objectFile("/tmp/" + id + "/userFeatures"));
  JavaPairRDD<Integer, Tuple2<String, double[]>> indexItemFeatures =
      JavaPairRDD.<Integer, Tuple2<String, double[]>>fromJavaRDD(
          jsc.<Tuple2<Integer, Tuple2<String, double[]>>>objectFile("/tmp/" + id + "/indexItemFeatures"));
  JavaPairRDD<String, Integer> userIndex = JavaPairRDD.<String, Integer>fromJavaRDD(
      jsc.<Tuple2<String, Integer>>objectFile("/tmp/" + id + "/userIndex"));
  JavaPairRDD<String, Integer> itemIndex = JavaPairRDD.<String, Integer>fromJavaRDD(
      jsc.<Tuple2<String, Integer>>objectFile("/tmp/" + id + "/itemIndex"));
  JavaRDD<ItemScore> itemPopularityScore = jsc.objectFile("/tmp/" + id + "/itemPopularityScore");
  Map<String, Item> items = jsc.<Map<String, Item>>objectFile("/tmp/" + id + "/items").collect().get(0);

  logger.info("loaded model");
  return new Model(userFeatures, indexItemFeatures, userIndex, itemIndex, itemPopularityScore, items);
}
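The load method above relies on JavaSparkContext.fromSparkContext and objectFile to restore persisted RDDs. A minimal round-trip sketch, assuming a hypothetical scratch path under /tmp:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ObjectFileRoundTrip {
  public static void main(String[] args) {
    SparkContext sc = new SparkContext(
        new SparkConf().setAppName("object-file-demo").setMaster("local[2]"));
    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

    // Hypothetical scratch location; the random suffix avoids clashing with an existing path.
    String path = "/tmp/object-file-demo-" + System.nanoTime();
    jsc.parallelize(Arrays.asList(1, 2, 3)).saveAsObjectFile(path);

    // objectFile() restores the RDD that saveAsObjectFile() wrote.
    JavaRDD<Integer> restored = jsc.objectFile(path);
    System.out.println("Restored count: " + restored.count());

    sc.stop();
  }
}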
Example #18
Source File: SparkJavaRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Constructor with Spark source RDD.
 *
 * @param sparkRDD     the Spark source rdd to wrap.
 * @param sparkContext the Spark context in the wrapped rdd.
 * @param dag          the IR DAG in construction.
 * @param lastVertex   the last vertex of the DAG in construction.
 */
SparkJavaRDD(final org.apache.spark.rdd.RDD<T> sparkRDD,
             final SparkContext sparkContext,
             final DAG<IRVertex, IREdge> dag,
             final IRVertex lastVertex) {
  super(sparkRDD, ClassTag$.MODULE$.apply(Object.class));

  this.rdd = new RDD<>(sparkContext, dag, lastVertex, Option.apply(sparkRDD),
      ClassTag$.MODULE$.apply(Object.class));
}
Example #19
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 5 votes |
public MizoRDD<MizoEdge> edgesRDD(SparkContext sc) {
  return new MizoRDD<MizoEdge>(sc, this, ClassManifestFactory.classType(MizoEdge.class)) {
    @Override
    public scala.collection.Iterator<MizoEdge> createRegionIterator(
        Iterator<IMizoRelationParser> relationsIterator) {
      return new MizoEdgesIterator(relationsIterator, this.config);
    }
  };
}
Example #20
Source File: RDDUtils.java From geowave with Apache License 2.0 | 5 votes |
/**
 * Translate a set of objects in a JavaRDD to a provided type and push to GeoWave
 *
 * @throws IOException
 */
private static void writeToGeoWave(
    final SparkContext sc,
    final Index index,
    final DataStorePluginOptions outputStoreOptions,
    final DataTypeAdapter adapter,
    final JavaRDD<SimpleFeature> inputRDD) throws IOException {

  // setup the configuration and the output format
  final Configuration conf = new org.apache.hadoop.conf.Configuration(sc.hadoopConfiguration());
  GeoWaveOutputFormat.setStoreOptions(conf, outputStoreOptions);
  GeoWaveOutputFormat.addIndex(conf, index);
  GeoWaveOutputFormat.addDataAdapter(conf, adapter);

  // create the job
  final Job job = new Job(conf);
  job.setOutputKeyClass(GeoWaveOutputKey.class);
  job.setOutputValueClass(SimpleFeature.class);
  job.setOutputFormatClass(GeoWaveOutputFormat.class);

  // broadcast string names
  final ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
  final Broadcast<String> typeName = sc.broadcast(adapter.getTypeName(), stringTag);
  final Broadcast<String> indexName = sc.broadcast(index.getName(), stringTag);

  // map to a pair containing the output key and the output value
  inputRDD.mapToPair(
      feat -> new Tuple2<>(
          new GeoWaveOutputKey(typeName.value(), indexName.value()),
          feat)).saveAsNewAPIHadoopDataset(job.getConfiguration());
}
Example #21
Source File: ParameterAveragingTrainingWorkerStats.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Override
public void exportStatFiles(String outputPath, SparkContext sc) throws IOException {
  String d = DEFAULT_DELIMITER;

  //Broadcast get time:
  StatsUtils.exportStats(parameterAveragingWorkerBroadcastGetValueTimeMs, outputPath,
      FILENAME_BROADCAST_GET_STATS, d, sc);

  //Network init time:
  StatsUtils.exportStats(parameterAveragingWorkerInitTimeMs, outputPath, FILENAME_INIT_STATS, d, sc);

  //Network fit time:
  StatsUtils.exportStats(parameterAveragingWorkerFitTimesMs, outputPath, FILENAME_FIT_STATS, d, sc);
}
Example #22
Source File: RDDUtils.java From geowave with Apache License 2.0 | 5 votes |
public static Broadcast<? extends NumericIndexStrategy> broadcastIndexStrategy(
    final SparkContext sc,
    final NumericIndexStrategy indexStrategy) {
  final ClassTag<NumericIndexStrategy> indexClassTag =
      scala.reflect.ClassTag$.MODULE$.apply(indexStrategy.getClass());
  final Broadcast<NumericIndexStrategy> broadcastStrategy = sc.broadcast(indexStrategy, indexClassTag);
  return broadcastStrategy;
}
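Because SparkContext.broadcast is a Scala API, Java callers must pass a ClassTag explicitly, as both GeoWave methods do. A stripped-down sketch of that pattern (the class name and broadcast value are illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.reflect.ClassTag;

public class BroadcastFromJava {
  public static void main(String[] args) {
    SparkContext sc = new SparkContext(
        new SparkConf().setAppName("broadcast-demo").setMaster("local"));

    // SparkContext.broadcast is a Scala API, so Java callers supply the ClassTag themselves.
    ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
    Broadcast<String> greeting = sc.broadcast("hello from the driver", tag);

    System.out.println(greeting.value());
    sc.stop();
  }
}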
Example #23
Source File: SparkSessionUtil.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 5 votes |
static public SparkSession createSparkSession() {
  SparkSession.Builder builder = SparkSession.builder()
      .appName("test")
      .master("local[1]")
      .config("spark.ui.enabled", false);

  SparkSession sparkSession = builder.getOrCreate();

  SparkContext sparkContext = sparkSession.sparkContext();
  sparkContext.setLogLevel("ERROR");

  return sparkSession;
}
Example #24
Source File: SparkTestBase.java From spark-transformers with Apache License 2.0 | 5 votes |
@Before
public void setup() {
  SparkConf sparkConf = new SparkConf();
  String master = "local[2]";
  sparkConf.setMaster(master);
  sparkConf.setAppName("Local Spark Unit Test");
  sc = new JavaSparkContext(new SparkContext(sparkConf));
  sqlContext = new SQLContext(sc);
}
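A common counterpart to this setup is an @After method that stops the context so later tests can create their own. A self-contained JUnit 4 sketch (the test class and assertion are hypothetical, and the SQLContext from the original setup is omitted):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

public class LocalSparkContextTest {
  private JavaSparkContext sc;

  @Before
  public void setup() {
    SparkConf sparkConf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("Local Spark Unit Test");
    sc = new JavaSparkContext(new SparkContext(sparkConf));
  }

  // Stopping the context after each test avoids "only one SparkContext per JVM" failures.
  @After
  public void teardown() {
    if (sc != null) {
      sc.stop();
      sc = null;
    }
  }

  @Test
  public void countsAParallelizedCollection() {
    Assert.assertEquals(3L, sc.parallelize(Arrays.asList(1, 2, 3)).count());
  }
}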
Example #25
Source File: DataframeCheckpointApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
  SparkConf conf = new SparkConf()
      .setAppName("Checkpoint")
      .setMaster("local[*]");
  SparkContext sparkContext = new SparkContext(conf);

  // We need to specify where Spark will save the checkpoint file. It can be
  // an HDFS location.
  sparkContext.setCheckpointDir("/tmp");

  SparkSession spark = SparkSession.builder()
      .appName("Checkpoint")
      .master("local[*]")
      .getOrCreate();

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df1 = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .load(filename);

  System.out.println("DF #1 - step #1: simple dump of the dataframe");
  df1.show();

  System.out.println("DF #2 - step #2: same as DF #1 - step #1");
  Dataset<Row> df2 = df1.checkpoint(false);
  df2.show();

  df1 = df1.withColumn("x", df1.col("_c0"));
  System.out.println("DF #1 - step #2: new column x, which is the same as _c0");
  df1.show();

  System.out.println("DF #2 - step #2: no operation was done on df2");
  df2.show();
}
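Dataset.checkpoint() above depends on the checkpoint directory registered on the SparkContext; the same directory drives RDD-level checkpointing. A short sketch of the RDD variant (the paths and class name are illustrative):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RddCheckpointDemo {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext(
        new SparkConf().setAppName("rdd-checkpoint").setMaster("local[*]"));

    // RDD checkpointing also needs a checkpoint directory registered on the context.
    jsc.setCheckpointDir("/tmp/checkpoints"); // hypothetical local path; use HDFS on a cluster

    JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4));
    numbers.checkpoint();
    numbers.count(); // an action forces the checkpoint to be materialized

    System.out.println("Checkpointed: " + numbers.isCheckpointed());
    jsc.stop();
  }
}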
Example #26
Source File: TestStreamingStep.java From envelope with Apache License 2.0 | 5 votes |
public JavaRDD<String> generateRDD() {
  Random values = new Random();
  values.setSeed(System.currentTimeMillis());
  List<String> list = Lists.newLinkedList();
  for (int i = 0; i < batchSize; i++) {
    list.add(String.valueOf(values.nextLong()));
  }
  SparkContext sc = Contexts.getSparkSession().sparkContext();
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
  return jsc.parallelize(list, this.partitions);
}
Example #27
Source File: RDD.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Constructor.
 *
 * @param sparkContext spark context containing configurations.
 * @param dag          the current DAG.
 */
private RDD(final SparkContext sparkContext, final DAG<IRVertex, IREdge> dag) {
  super(sparkContext, null, ClassTag$.MODULE$.apply((Class<T>) Object.class));

  this.loopVertexStack = new Stack<>();
  this.dag = dag;
}
Example #28
Source File: SparkFrontendUtils.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Derive Spark serializer from a spark context.
 *
 * @param sparkContext spark context to derive the serializer from.
 * @return the serializer.
 */
public static Serializer deriveSerializerFrom(final SparkContext sparkContext) {
  if (sparkContext.conf().get("spark.serializer", "")
      .equals("org.apache.spark.serializer.KryoSerializer")) {
    return new KryoSerializer(sparkContext.conf());
  } else {
    return new JavaSerializer(sparkContext.conf());
  }
}
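The branch above keys off the spark.serializer setting. A sketch of selecting Kryo the usual way, on the SparkConf before the context is created (the class name is illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class KryoConfigDemo {
  public static void main(String[] args) {
    // Kryo is normally selected by setting spark.serializer before the context starts.
    SparkConf conf = new SparkConf()
        .setAppName("kryo-demo")
        .setMaster("local")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    SparkContext sc = new SparkContext(conf);
    System.out.println("spark.serializer = " + sc.getConf().get("spark.serializer"));
    sc.stop();
  }
}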
Example #29
Source File: JavaRDD.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Static method to create a JavaRDD object from an iterable object.
 *
 * @param sparkContext spark context containing configurations.
 * @param initialData  initial data.
 * @param parallelism  parallelism information.
 * @param <T>          type of the resulting object.
 * @return the new JavaRDD object.
 */
public static <T> JavaRDD<T> of(final SparkContext sparkContext,
                                final Iterable<T> initialData,
                                final Integer parallelism) {
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();

  final IRVertex initializedSourceVertex = new InitializedSourceVertex<>(initialData);
  initializedSourceVertex.setProperty(ParallelismProperty.of(parallelism));
  builder.addVertex(initializedSourceVertex);

  return new JavaRDD<>(sparkContext, builder.buildWithoutSourceSinkCheck(), initializedSourceVertex);
}
Example #30
Source File: JavaRDD.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Constructor.
 *
 * @param sparkContext spark context containing configurations.
 * @param dag          the current DAG.
 * @param lastVertex   last vertex added to the builder.
 */
JavaRDD(final SparkContext sparkContext,
        final DAG<IRVertex, IREdge> dag,
        final IRVertex lastVertex) {
  // TODO #366: resolve while implementing scala RDD.
  super(RDD.of(sparkContext), ClassTag$.MODULE$.apply((Class<T>) Object.class));

  this.loopVertexStack = new Stack<>();
  this.sparkContext = sparkContext;
  this.dag = dag;
  this.lastVertex = lastVertex;
  this.serializer = SparkFrontendUtils.deriveSerializerFrom(sparkContext);
}