Java Code Examples for org.apache.spark.sql.SparkSession#sparkContext()
The following examples show how to use org.apache.spark.sql.SparkSession#sparkContext().
Each example lists its original project and source file.
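All of the examples follow the same basic pattern: call sparkContext() on an existing SparkSession and wrap the returned SparkContext in a JavaSparkContext (or hand it to a StreamingContext) so the Java RDD API can be used alongside the Dataset API. The minimal sketch below, which is not taken from any of the projects listed here and uses a made-up app name and a local master purely for illustration, shows that pattern on its own.

    import java.util.Arrays;

    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.SparkSession;

    // Minimal sketch: obtain the session's underlying SparkContext and wrap it
    // in a JavaSparkContext. The wrapper shares the existing context; it does
    // not create a second one.
    public class SparkContextFromSessionSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                .appName("SparkContextFromSessionSketch") // hypothetical app name
                .master("local[*]")                       // local master for illustration only
                .getOrCreate();

            JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

            JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
            System.out.println("sum = " + numbers.reduce((a, b) -> a + b));

            // Stopping the session also stops the shared SparkContext.
            spark.stop();
        }
    }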
Example 1
Source File: MyVariantDataset.java From mmtf-spark with Apache License 2.0 | 8 votes |
/**
 * Returns a dataset of missense variations for a list of Uniprot Ids and a MyVariant.info query.
 * See <a href="http://myvariant.info/docs/">query syntax</a>.
 * <p> Example:
 * <pre>
 * String query = "clinvar.rcv.clinical_significance:pathogenic "
 *     + "OR clinvar.rcv.clinical_significance:likely pathogenic";
 * </pre>
 *
 * @param uniprotIds list of Uniprot Ids
 * @param query MyVariant.info query string
 * @return dataset with variation Ids and Uniprot Ids, or null if no data are found
 * @throws IOException
 */
public static Dataset<Row> getVariations(List<String> uniprotIds, String query) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(uniprotIds).flatMap(m -> getData(m, query));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);

    // return null if dataset contains no results
    if (!Arrays.asList(dataset.columns()).contains("hits")) {
        System.out.println("MyVariantDataset: no matches found");
        return null;
    }

    return flattenDataset(dataset);
}
Example 2
Source File: PolymerInteractionFingerprintDemo.java From mmtf-spark with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().master("local[*]")
            .appName(PolymerInteractionFingerprintDemo.class.getSimpleName()).getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<String> pdbIds = Arrays.asList("1OHR");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    // find ASP-ARG salt bridges
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(3.5);
    filter.setMinInteractions(1);
    filter.setQueryGroups(true, "ASP");
    filter.setQueryAtomNames(true, "OD1", "OD2");
    filter.setTargetGroups(true, "ARG");
    filter.setTargetAtomNames(true, "NH1", "NH2");

    Dataset<Row> interactions = InteractionFingerprinter.getPolymerInteractions(pdb, filter).cache();
    interactions.show(false);

    sc.close();
}
Example 3
Source File: StructureAligner.java From mmtf-spark with Apache License 2.0 | 6 votes |
/**
 * Calculates all vs. all structural alignments of protein chains using the
 * specified alignment algorithm. The input structures must contain single
 * protein chains.
 *
 * @param targets structures containing single protein chains
 * @param alignmentAlgorithm name of the algorithm
 * @return dataset with alignment metrics
 */
public static Dataset<Row> getAllVsAllAlignments(JavaPairRDD<String, StructureDataInterface> targets,
        String alignmentAlgorithm) {
    SparkSession session = SparkSession.builder().getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(session.sparkContext());

    // create a list of chainName / C-alpha coordinates
    List<Tuple2<String, Point3d[]>> chains = targets.mapValues(
            s -> new ColumnarStructureX(s, true).getcAlphaCoordinates()).collect();

    // create an RDD of all pair indices (0,1), (0,2), ..., (1,2), (1,3), ...
    JavaRDD<Tuple2<Integer, Integer>> pairs = getPairs(sc, chains.size());

    // calculate structural alignments for all pairs.
    // broadcast (copy) chains to all worker nodes for efficient processing.
    // for each pair there can be zero or more solutions, therefore we flatmap the pairs.
    JavaRDD<Row> rows = pairs.flatMap(new StructuralAlignmentMapper(sc.broadcast(chains), alignmentAlgorithm));

    // convert rows to a dataset
    return session.createDataFrame(rows, getSchema());
}
Example 4
Source File: BuildDataFrameFromScratch2.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
    SparkSession spark = SparkSession.builder().appName(
            "Build a DataFrame from Scratch").master("local[*]")
            .getOrCreate();

    List<String[]> stringAsList = new ArrayList<>();
    stringAsList.add(new String[] { "bar1.1", "bar2.1" });
    stringAsList.add(new String[] { "bar1.2", "bar2.2" });

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
            .map((String[] row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("foe1", DataTypes.StringType, false),
            DataTypes.createStructField("foe2", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();

    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
Example 5
Source File: RewriteDataFilesAction.java From iceberg with Apache License 2.0 | 5 votes |
RewriteDataFilesAction(SparkSession spark, Table table) {
    this.sparkContext = new JavaSparkContext(spark.sparkContext());
    this.table = table;
    this.spec = table.spec();
    this.filter = Expressions.alwaysTrue();
    this.caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive", "false"));

    long splitSize = PropertyUtil.propertyAsLong(
        table.properties(),
        TableProperties.SPLIT_SIZE,
        TableProperties.SPLIT_SIZE_DEFAULT);
    long targetFileSize = PropertyUtil.propertyAsLong(
        table.properties(),
        TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
        TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
    this.targetSizeInBytes = Math.min(splitSize, targetFileSize);

    this.splitLookback = PropertyUtil.propertyAsInt(
        table.properties(),
        TableProperties.SPLIT_LOOKBACK,
        TableProperties.SPLIT_LOOKBACK_DEFAULT);
    this.splitOpenFileCost = PropertyUtil.propertyAsLong(
        table.properties(),
        TableProperties.SPLIT_OPEN_FILE_COST,
        TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

    this.fileIO = SparkUtil.serializableFileIO(table);
    this.encryptionManager = table.encryption();
}
Example 6
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0 | 5 votes |
RemoveOrphanFilesAction(SparkSession spark, Table table) {
    this.spark = spark;
    this.sparkContext = new JavaSparkContext(spark.sparkContext());
    this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf());
    this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism();
    this.table = table;
    this.ops = ((HasTableOperations) table).operations();
    this.location = table.location();
}
Example 7
Source File: PiComputeLambdaWithRddApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
/**
 * The processing code.
 */
private void start(int slices) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaSparkPi")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    int n = 100000 * slices;
    List<Integer> l = new ArrayList<>(n);
    for (int i = 0; i < n; i++) {
        l.add(i);
    }

    JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);

    int count = dataSet.map(integer -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        return (x * x + y * y <= 1) ? 1 : 0;
    }).reduce((integer, integer2) -> integer + integer2);

    System.out.println("Pi is roughly " + 4.0 * count / n);

    spark.stop();
}
Example 8
Source File: JobHelper.java From sylph with Apache License 2.0 | 5 votes |
static Serializable build1xJob(String jobId, EtlFlow flow, URLClassLoader jobClassLoader, ConnectorStore connectorStore)
        throws Exception {
    final AtomicBoolean isCompile = new AtomicBoolean(true);
    final Supplier<StreamingContext> appGetter = (Supplier<StreamingContext> & Serializable) () -> {
        logger.info("========create spark StreamingContext mode isCompile = " + isCompile.get() + "============");
        SparkConf sparkConf = isCompile.get()
                ? new SparkConf().setMaster("local[*]").setAppName("sparkCompile")
                : new SparkConf();
        //todo: 5s is default
        SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();
        StreamingContext spark = new StreamingContext(sparkSession.sparkContext(), Seconds.apply(5));

        Bean bean = binder -> binder.bind(StreamingContext.class, spark);
        StreamNodeLoader loader = new StreamNodeLoader(connectorStore, IocFactory.create(bean));
        buildGraph(loader, flow);
        return spark;
    };

    JVMLauncher<Integer> launcher = JVMLaunchers.<Integer>newJvm()
            .setCallable(() -> {
                appGetter.get();
                return 1;
            })
            .setConsole((line) -> System.out.println(new Ansi().fg(YELLOW).a("[" + jobId + "] ").fg(GREEN).a(line).reset()))
            .addUserURLClassLoader(jobClassLoader)
            .notDepThisJvmClassPath()
            .setClassLoader(jobClassLoader)
            .build();
    launcher.startAndGet();
    isCompile.set(false);
    return (Serializable) appGetter;
}
Example 9
Source File: JavaLogQuery.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaLogQuery")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted =
        dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
            @Override
            public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
                return new Tuple2<>(extractKey(s), extractStats(s));
            }
        });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts =
        extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
            @Override
            public Stats call(Stats stats, Stats stats2) {
                return stats.merge(stats2);
            }
        });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?, ?> t : output) {
        System.out.println(t._1() + "\t" + t._2());
    }
    spark.stop();
}
Example 10
Source File: JavaStatusTrackerDemo.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession
        .builder()
        .appName(APP_NAME)
        .getOrCreate();

    final JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    // Example of implementing a progress reporter for a simple job.
    JavaRDD<Integer> rdd = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 5).map(
        new IdentityWithDelay<Integer>());
    JavaFutureAction<List<Integer>> jobFuture = rdd.collectAsync();
    while (!jobFuture.isDone()) {
        Thread.sleep(1000);  // 1 second
        List<Integer> jobIds = jobFuture.jobIds();
        if (jobIds.isEmpty()) {
            continue;
        }
        int currentJobId = jobIds.get(jobIds.size() - 1);
        SparkJobInfo jobInfo = jsc.statusTracker().getJobInfo(currentJobId);
        SparkStageInfo stageInfo = jsc.statusTracker().getStageInfo(jobInfo.stageIds()[0]);
        System.out.println(stageInfo.numTasks() + " tasks total: " + stageInfo.numActiveTasks() +
            " active, " + stageInfo.numCompletedTasks() + " complete");
    }

    System.out.println("Job results are: " + jobFuture.get());
    spark.stop();
}
Example 11
Source File: SparkSessionUtil.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 5 votes |
static public SparkSession createSparkSession() {
    SparkSession.Builder builder = SparkSession.builder()
        .appName("test")
        .master("local[1]")
        .config("spark.ui.enabled", false);

    SparkSession sparkSession = builder.getOrCreate();

    SparkContext sparkContext = sparkSession.sparkContext();
    sparkContext.setLogLevel("ERROR");

    return sparkSession;
}
Example 12
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Returns a Dataset of pairwise interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom, its interacting neighbor atom, and
 * the interaction distance is represented as a row.
 *
 * @param structures a set of PDB structures
 * @param filter filter criteria for determining noncovalent interactions
 * @return dataset of pairwise interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 */
public static Dataset<Row> getPairInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // calculate interactions
    boolean pairwise = true;
    JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise));

    // convert JavaRDD to Dataset
    return spark.createDataFrame(rows, AtomInteraction.getPairInteractionSchema());
}
Example 13
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Returns a dataset of interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom and its interacting neighbor atoms
 * are represented as a row in a Dataset. In addition, geometric features
 * of the interactions, such as distances, angles, and orientational order
 * parameters, are returned in each row (see {@link edu.sdsc.mm.dev.utils.CoordinationGeometry}).
 *
 * @param structures a set of PDB structures
 * @param filter filter criteria for determining noncovalent interactions
 * @return dataset of interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 * @see edu.sdsc.mm.dev.utils.CoordinationGeometry
 */
public static Dataset<Row> getInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // calculate interactions
    boolean pairwise = false;
    JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise));

    // convert JavaRDD to Dataset
    return spark.createDataFrame(rows, AtomInteraction.getSchema(filter.getMaxInteractions()));
}
Example 14
Source File: G2SDataset.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 *
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);
    dataset.show();

    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }

    dataset = standardizeData(dataset);

    return flattenDataset(dataset);
}
Example 15
Source File: BuildDataFrameFromScratch.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
    SparkSession spark = SparkSession.builder().appName(
            "Build a DataFrame from Scratch").master("local[*]")
            .getOrCreate();

    List<String> stringAsList = new ArrayList<>();
    stringAsList.add("bar");

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
            .map((String row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(
            new StructField[] { DataTypes.createStructField("foe", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();

    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
Example 16
Source File: HoodieJavaStreamingApp.java From hudi with Apache License 2.0 | 4 votes |
/**
 * @throws Exception
 */
public void run() throws Exception {
    // Spark session setup..
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());

    // folder path clean up and creation, preparing the environment
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
    fs.delete(new Path(streamingSourcePath), true);
    fs.delete(new Path(streamingCheckpointingPath), true);
    fs.delete(new Path(tablePath), true);
    fs.mkdirs(new Path(streamingSourcePath));

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

    List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001", 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002", 100));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));

    // setup the input for streaming
    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath);

    // start streaming and showing
    ExecutorService executor = Executors.newFixedThreadPool(2);

    // thread for spark structured streaming
    Future<Void> streamFuture = executor.submit(() -> {
        LOG.info("===== Streaming Starting =====");
        stream(streamingInput);
        LOG.info("===== Streaming Ends =====");
        return null;
    });

    // thread for adding data to the streaming source and showing results over time
    Future<Void> showFuture = executor.submit(() -> {
        LOG.info("===== Showing Starting =====");
        show(spark, fs, inputDF1, inputDF2);
        LOG.info("===== Showing Ends =====");
        return null;
    });

    // let the threads run
    streamFuture.get();
    showFuture.get();

    executor.shutdown();
}
Example 17
Source File: GeoWaveSparkSQLIT.java From geowave with Apache License 2.0 | 4 votes |
@Test
public void testCreateDataFrame() throws Exception {
    // Set up Spark
    final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
    final SparkContext context = session.sparkContext();

    // ingest test points
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

    final SqlQueryRunner queryRunner = new SqlQueryRunner();
    queryRunner.setSparkSession(session);

    try {
        // Load RDD from datastore, no filters
        final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
        final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

        final long count = javaRdd.count();
        LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

        queryRunner.addInputStore(dataStore, null, "features");

        final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

        Dataset<Row> results = queryRunner.run();
        final long containsCount = results.count();
        LOGGER.warn("Got " + containsCount + " for GeomContains test");

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
        results = queryRunner.run();
        final long withinCount = results.count();
        LOGGER.warn("Got " + withinCount + " for GeomWithin test");

        Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

        // Test the output writer
        final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);
        sqlResultsWriter.writeResults("sqltest");

        queryRunner.removeAllStores();

        // Test other spatial UDFs
        final String line1 = "LINESTRING(0 0, 10 10)";
        final String line2 = "LINESTRING(0 10, 10 0)";
        queryRunner.setSql(
            "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");

        Row result = queryRunner.run().head();

        final boolean intersect = result.getBoolean(0);
        LOGGER.warn("GeomIntersects returned " + intersect);

        Assert.assertTrue("Lines should intersect", intersect);

        queryRunner.setSql(
            "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");

        result = queryRunner.run().head();

        final boolean disjoint = result.getBoolean(0);
        LOGGER.warn("GeomDisjoint returned " + disjoint);

        Assert.assertFalse("Lines should not be disjoint", disjoint);
    } catch (final Exception e) {
        e.printStackTrace();
        TestUtils.deleteAll(dataStore);
        Assert.fail(
            "Error occurred while testing a bounding box query of spatial index: '"
                + e.getLocalizedMessage() + "'");
    }

    // Clean up
    TestUtils.deleteAll(dataStore);
}
Example 18
Source File: ProcessVendorTrasactions.java From aws-big-data-blog with Apache License 2.0 | 4 votes |
public static void run(String jobInputParam) throws Exception {
    List<StructField> schemaFields = new ArrayList<StructField>();
    schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(schemaFields);

    SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys");
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    String redshiftJDBCURL = props.getProperty("redshift.jdbc.url");
    String s3TempPath = props.getProperty("s3.temp.path");
    System.out.println("props" + props);

    JavaRDD<Row> salesRDD = sc.textFile(jobInputParam)
        .map(new Function<String, Row>() {
            public Row call(String saleRec) {
                String[] fields = saleRec.split(",");
                return RowFactory.create(fields[0], fields[1], fields[2], fields[3], fields[4]);
            }
        });

    Dataset<Row> salesDF = spark.createDataFrame(salesRDD, schema);

    Dataset<Row> vendorItemSaleAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("4"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));
    Dataset<Row> vendorItemTaxAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("5"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));
    Dataset<Row> vendorItemDiscountAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("6"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));

    String[] joinColArray = {"vendor_id", "item_id", "trans_date"};
    vendorItemSaleAmountDF.printSchema();
    Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq();

    Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF.join(vendorItemTaxAmountDF, commonJoinColumns, "left_outer")
        .join(vendorItemDiscountAmountDF, commonJoinColumns, "left_outer")
        .toDF("vendor_id", "item_id", "trans_date", "sale_amount", "tax_amount", "discount_amount");

    vendorAggregatedDF.printSchema();

    DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain();
    AWSSessionCredentials creds = (AWSSessionCredentials) provider.getCredentials();

    String appendix = new StringBuilder(String.valueOf(System.currentTimeMillis()))
        .append("_").append(String.valueOf(new Random().nextInt(10) + 1)).toString();

    String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp")
        .append(appendix)
        .append(" where vendortranssummary.vendor_id=vendortranssummary_temp")
        .append(appendix)
        .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp")
        .append(appendix)
        .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp")
        .append(appendix)
        .append(".trans_date;")
        .append("insert into vendortranssummary select * from vendortranssummary_temp")
        .append(appendix)
        .append(";drop table vendortranssummary_temp")
        .append(appendix)
        .append(";end transaction;").toString();

    vendorAggregatedDF.write().format("com.databricks.spark.redshift")
        .option("url", redshiftJDBCURL)
        .option("dbtable", "vendortranssummary_temp" + appendix)
        .option("usestagingtable", "false")
        .option("postactions", vendorTransSummarySQL)
        .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId())
        .option("temporary_aws_secret_access_key", creds.getAWSSecretKey())
        .option("temporary_aws_session_token", creds.getSessionToken())
        .option("tempdir", s3TempPath).mode(SaveMode.Overwrite).save();
}
Example 19
Source File: StateLessProcessingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 3 votes |
public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

    SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("stateless Streaming Example")
        .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

    JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
        Durations.milliseconds(1000));

    JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);

    JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
        ObjectMapper mapper = new ObjectMapper();
        return mapper.readValue(x, FlightDetails.class);
    });

    //flightDetailsStream.print();

    //flightDetailsStream.foreachRDD((VoidFunction<JavaRDD<FlightDetails>>) rdd -> rdd.saveAsTextFile("hdfs://namenode:port/path"));

    JavaDStream<FlightDetails> window = flightDetailsStream.window(Durations.minutes(5), Durations.minutes(1));

    JavaPairDStream<String, Double> transfomedWindow = window
        .mapToPair(f -> new Tuple2<String, Double>(f.getFlightId(), f.getTemperature()))
        .mapValues(t -> new Tuple2<Double, Integer>(t, 1))
        .reduceByKey((t1, t2) -> new Tuple2<Double, Integer>(t1._1() + t2._1(), t1._2() + t2._2()))
        .mapValues(t -> t._1() / t._2());

    transfomedWindow.cache();
    transfomedWindow.print();

    jssc.start();
    jssc.awaitTermination();
}
Example 20
Source File: ValueSetUdfs.java From bunsen with Apache License 2.0 | 3 votes |
/**
 * Pushes an "in_valueset" UDF that uses the given {@link BroadcastableValueSets} for its content.
 *
 * @param spark the spark session
 * @param valueSets the valuesets to use in the UDF
 */
public static synchronized void pushUdf(SparkSession spark, BroadcastableValueSets valueSets) {
    JavaSparkContext ctx = new JavaSparkContext(spark.sparkContext());

    Broadcast<BroadcastableValueSets> broadcast = ctx.broadcast(valueSets);

    pushUdf(spark, broadcast);
}