Java Code Examples for org.apache.spark.sql.SparkSession#sparkContext()

The following examples show how to use org.apache.spark.sql.SparkSession#sparkContext(). The source file, originating project, and license are noted above each example.
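All of the examples below share the same basic pattern: obtain the active SparkSession, call sparkContext() to get the underlying SparkContext, and, where the RDD API is needed, wrap it in a JavaSparkContext. A minimal sketch of that pattern (the class and application names are illustrative):

import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class SparkContextAccessExample {

    public static void main(String[] args) {
        // build or reuse the active session
        SparkSession spark = SparkSession.builder()
                .appName("SparkContextAccessExample")
                .master("local[*]")
                .getOrCreate();

        // the Scala SparkContext backing the session
        SparkContext sc = spark.sparkContext();

        // wrap it for the Java RDD API (parallelize, textFile, broadcast, ...)
        JavaSparkContext jsc = new JavaSparkContext(sc);

        System.out.println("Application id: " + sc.applicationId());

        spark.stop();
    }
}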
Example 1
Source File: MyVariantDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset of missense variations for a list of Uniprot Ids and a MyVariant.info query.
 * See <a href="http://myvariant.info/docs/">query syntax</a>.
 * <p> Example:
 * <pre>
 * String query = "clinvar.rcv.clinical_significance:pathogenic " 
 *                + "OR clinvar.rcv.clinical_significance:likely pathogenic";
 * </pre>
 * 
 * @param uniprotIds list of Uniprot Ids
 * @param query MyVariant.info query string
 * @return dataset with variation Ids and Uniprot Ids or null if no data are found
 * @throws IOException if the MyVariant.info data cannot be downloaded
 */
public static Dataset<Row> getVariations(List<String> uniprotIds, String query) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(uniprotIds).flatMap(m -> getData(m, query));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);

    // return null if dataset contains no results
    if (!Arrays.asList(dataset.columns()).contains("hits")) {
        System.out.println("MyVariantDataset: no matches found");
        return null;
    }

    return flattenDataset(dataset);
}
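A hypothetical call to getVariations, using the query shown in the Javadoc above and assuming the same imports as the example (the Uniprot Id is illustrative):

List<String> uniprotIds = Arrays.asList("P15056"); // hypothetical Uniprot Id
String query = "clinvar.rcv.clinical_significance:pathogenic "
        + "OR clinvar.rcv.clinical_significance:likely pathogenic";

Dataset<Row> variations = MyVariantDataset.getVariations(uniprotIds, query);
if (variations != null) {
    variations.show();
}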
 
Example 2
Source File: PolymerInteractionFingerprintDemo.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().master("local[*]")
            .appName(PolymerInteractionFingerprintDemo.class.getSimpleName()).getOrCreate();

    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<String> pdbIds = Arrays.asList("1OHR");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    // find ASP-ARG salt bridges
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(3.5);
    filter.setMinInteractions(1);
    filter.setQueryGroups(true, "ASP");
    filter.setQueryAtomNames(true, "OD1", "OD2");
    filter.setTargetGroups(true, "ARG");
    filter.setTargetAtomNames(true, "NH1", "NH2");

    Dataset<Row> interactions = InteractionFingerprinter.getPolymerInteractions(pdb, filter).cache();
    interactions.show(false);

    sc.close();
}
 
Example 3
Source File: StructureAligner.java    From mmtf-spark with Apache License 2.0
/**
 * Calculates all vs. all structural alignments of protein chains using the 
 * specified alignment algorithm. The input structures must contain single 
 * protein chains.
 * 
 * @param targets structures containing single protein chains
 * @param alignmentAlgorithm name of the algorithm
 * @return dataset with alignment metrics
 */
public static Dataset<Row> getAllVsAllAlignments(JavaPairRDD<String, StructureDataInterface> targets,
		String alignmentAlgorithm) {

	SparkSession session = SparkSession.builder().getOrCreate();
	JavaSparkContext sc = new JavaSparkContext(session.sparkContext());

	// create a list of chainName / C-alpha coordinate pairs
	List<Tuple2<String, Point3d[]>> chains = targets.mapValues(
			s -> new ColumnarStructureX(s, true).getcAlphaCoordinates()).collect();

	// create an RDD of all pair indices (0,1), (0,2), ..., (1,2), (1,3), ...
	JavaRDD<Tuple2<Integer, Integer>> pairs = getPairs(sc, chains.size());
	
	// calculate structural alignments for all pairs.
	// broadcast (copy) chains to all worker nodes for efficient processing.
	// for each pair there can be zero or more solutions, therefore we flatmap the pairs.
	JavaRDD<Row> rows = pairs.flatMap(new StructuralAlignmentMapper(sc.broadcast(chains), alignmentAlgorithm));

	// convert rows to a dataset
	return session.createDataFrame(rows, getSchema());
}
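The getPairs helper is not part of this excerpt. A minimal sketch of what such a method might look like, assuming it simply enumerates all index pairs (i, j) with i < j and parallelizes them (an illustration, not the project's actual implementation):

private static JavaRDD<Tuple2<Integer, Integer>> getPairs(JavaSparkContext sc, int n) {
    // enumerate all unique pairs of chain indices
    List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();
    for (int i = 0; i < n - 1; i++) {
        for (int j = i + 1; j < n; j++) {
            pairs.add(new Tuple2<>(i, j));
        }
    }
    return sc.parallelize(pairs);
}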
 
Example 4
Source File: BuildDataFrameFromScratch2.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Build a DataFrame from Scratch").master("local[*]")
      .getOrCreate();

  List<String[]> stringAsList = new ArrayList<>();
  stringAsList.add(new String[] { "bar1.1", "bar2.1" });
  stringAsList.add(new String[] { "bar1.2", "bar2.2" });

  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((
      String[] row) -> RowFactory.create(row));

  // Creates schema
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField(
          "foe1", DataTypes.StringType, false),
          DataTypes.createStructField("foe2", DataTypes.StringType, false) });

  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

  log.debug("** Schema: ");
  df.printSchema();

  log.debug("** Data: ");
  df.show();

  sparkContext.close();
}
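As an aside, the detour through spark.sqlContext() is not strictly needed in Spark 2.x; SparkSession exposes createDataFrame directly, so the same DataFrame could presumably be built as follows (same rowRDD and schema as above):

Dataset<Row> df = spark.createDataFrame(rowRDD, schema);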
 
Example 5
Source File: RewriteDataFilesAction.java    From iceberg with Apache License 2.0
RewriteDataFilesAction(SparkSession spark, Table table) {
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.table = table;
  this.spec = table.spec();
  this.filter = Expressions.alwaysTrue();
  this.caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive", "false"));

  long splitSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_SIZE,
      TableProperties.SPLIT_SIZE_DEFAULT);
  long targetFileSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetSizeInBytes = Math.min(splitSize, targetFileSize);

  this.splitLookback = PropertyUtil.propertyAsInt(
      table.properties(),
      TableProperties.SPLIT_LOOKBACK,
      TableProperties.SPLIT_LOOKBACK_DEFAULT);
  this.splitOpenFileCost = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_OPEN_FILE_COST,
      TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

  this.fileIO = SparkUtil.serializableFileIO(table);
  this.encryptionManager = table.encryption();
}
 
Example 6
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
RemoveOrphanFilesAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf());
  this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism();
  this.table = table;
  this.ops = ((HasTableOperations) table).operations();
  this.location = table.location();
}
 
Example 7
Source File: PiComputeLambdaWithRddApp.java    From net.jgp.labs.spark with Apache License 2.0
/**
 * Estimates Pi with a Monte Carlo simulation: random points are drawn in the
 * square spanning [-1, 1] x [-1, 1], and the fraction that falls inside the
 * unit circle approximates Pi/4.
 *
 * @param slices number of partitions used to parallelize the computation
 */
private void start(int slices) {
  SparkSession spark = SparkSession
      .builder()
      .appName("JavaSparkPi")
      .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  int n = 100000 * slices;
  List<Integer> l = new ArrayList<>(n);
  for (int i = 0; i < n; i++) {
    l.add(i);
  }

  JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);

  int count = dataSet.map(integer -> {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    return (x * x + y * y <= 1) ? 1 : 0;
  }).reduce((integer, integer2) -> integer + integer2);

  System.out.println("Pi is roughly " + 4.0 * count / n);

  spark.stop();
}
 
Example 8
Source File: JobHelper.java    From sylph with Apache License 2.0
static Serializable build1xJob(String jobId, EtlFlow flow, URLClassLoader jobClassLoader, ConnectorStore connectorStore)
        throws Exception
{
    final AtomicBoolean isCompile = new AtomicBoolean(true);
    final Supplier<StreamingContext> appGetter = (Supplier<StreamingContext> & Serializable) () -> {
        logger.info("========create spark StreamingContext mode isCompile = " + isCompile.get() + "============");
        SparkConf sparkConf = isCompile.get() ?
                new SparkConf().setMaster("local[*]").setAppName("sparkCompile")
                : new SparkConf();
        //todo: 5s is default
        SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();
        StreamingContext spark = new StreamingContext(sparkSession.sparkContext(), Seconds.apply(5));

        Bean bean = binder -> binder.bind(StreamingContext.class, spark);
        StreamNodeLoader loader = new StreamNodeLoader(connectorStore, IocFactory.create(bean));
        buildGraph(loader, flow);
        return spark;
    };

    JVMLauncher<Integer> launcher = JVMLaunchers.<Integer>newJvm()
            .setCallable(() -> {
                appGetter.get();
                return 1;
            })
            .setConsole((line) -> System.out.println(new Ansi().fg(YELLOW).a("[" + jobId + "] ").fg(GREEN).a(line).reset()))
            .addUserURLClassLoader(jobClassLoader)
            .notDepThisJvmClassPath()
            .setClassLoader(jobClassLoader)
            .build();
    launcher.startAndGet();
    isCompile.set(false);
    return (Serializable) appGetter;
}
 
Example 9
Source File: JavaLogQuery.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLogQuery")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

  JavaPairRDD<Tuple3<String, String, String>, Stats> extracted = dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
    @Override
    public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
      return new Tuple2<>(extractKey(s), extractStats(s));
    }
  });

  JavaPairRDD<Tuple3<String, String, String>, Stats> counts = extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
    @Override
    public Stats call(Stats stats, Stats stats2) {
      return stats.merge(stats2);
    }
  });

  List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
  for (Tuple2<?,?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
  spark.stop();
}
 
Example 10
Source File: JavaStatusTrackerDemo.java    From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  SparkSession spark = SparkSession
    .builder()
    .appName(APP_NAME)
    .getOrCreate();

  final JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  // Example of implementing a progress reporter for a simple job.
  JavaRDD<Integer> rdd = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 5).map(
      new IdentityWithDelay<Integer>());
  JavaFutureAction<List<Integer>> jobFuture = rdd.collectAsync();
  while (!jobFuture.isDone()) {
    Thread.sleep(1000);  // 1 second
    List<Integer> jobIds = jobFuture.jobIds();
    if (jobIds.isEmpty()) {
      continue;
    }
    int currentJobId = jobIds.get(jobIds.size() - 1);
    SparkJobInfo jobInfo = jsc.statusTracker().getJobInfo(currentJobId);
    SparkStageInfo stageInfo = jsc.statusTracker().getStageInfo(jobInfo.stageIds()[0]);
    System.out.println(stageInfo.numTasks() + " tasks total: " + stageInfo.numActiveTasks() +
        " active, " + stageInfo.numCompletedTasks() + " complete");
  }

  System.out.println("Job results are: " + jobFuture.get());
  spark.stop();
}
 
Example 11
Source File: SparkSessionUtil.java    From jpmml-sparkml with GNU Affero General Public License v3.0
static
public SparkSession createSparkSession(){
	SparkSession.Builder builder = SparkSession.builder()
		.appName("test")
		.master("local[1]")
		.config("spark.ui.enabled", false);

	SparkSession sparkSession = builder.getOrCreate();

	SparkContext sparkContext = sparkSession.sparkContext();
	sparkContext.setLogLevel("ERROR");

	return sparkSession;
}
 
Example 12
Source File: GroupInteractionExtractor.java    From mmtf-spark with Apache License 2.0
/**
 * Returns a Dataset of pairwise interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom, its interacting neighbor atom, and
 * the interaction distance are represented as a row.
 * 
 * @param structures a set of PDB structures
 * @param filter filter criteria for determining noncovalent interactions
 * @return dataset of pairwise atom interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 */
public static Dataset<Row> getPairInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
	SparkSession spark = SparkSession.builder().getOrCreate();	
	@SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere
	JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // calculate interactions
	boolean pairwise = true;
	JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise));
	
	// convert JavaRDD to Dataset
	return spark.createDataFrame(rows, AtomInteraction.getPairInteractionSchema());
}
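A hypothetical usage of getPairInteractions, assuming structures loaded as in Example 2 and an InteractionFilter configured for ASP-ARG salt bridges (all of the settings here are illustrative):

JavaPairRDD<String, StructureDataInterface> pdb =
        MmtfReader.downloadFullMmtfFiles(Arrays.asList("1OHR"), sc);

InteractionFilter filter = new InteractionFilter();
filter.setDistanceCutoff(3.5);
filter.setQueryGroups(true, "ASP");
filter.setQueryAtomNames(true, "OD1", "OD2");
filter.setTargetGroups(true, "ARG");
filter.setTargetAtomNames(true, "NH1", "NH2");

Dataset<Row> pairInteractions = GroupInteractionExtractor.getPairInteractions(pdb, filter);
pairInteractions.show(false);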
 
Example 13
Source File: GroupInteractionExtractor.java    From mmtf-spark with Apache License 2.0
/**
    * Returns a dataset of interactions that satisfy the criteria of
    * the {@link InteractionFilter}. Each atom and its interacting neighbor atoms
    * are represented as a row in a Dataset. In addition, geometric features
    * of the interactions, such as distances, angles, and orientational order
    * parameters, are returned in each row (see {@link edu.sdsc.mm.dev.utils.CoordinationGeometry}).
    * 
    * @param structures a set of PDB structures
    * @param filter filter criteria for determining noncovalent interactions
    * @return dataset of atom interactions and their geometric features
    * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
    * @see edu.sdsc.mm.dev.utils.CoordinationGeometry
 */
public static Dataset<Row> getInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
	SparkSession spark = SparkSession.builder().getOrCreate();
	@SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere
	JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

	// calculate interactions
	boolean pairwise = false;
	JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise));
	
	// convert JavaRDD to Dataset
	return spark.createDataFrame(rows, AtomInteraction.getSchema(filter.getMaxInteractions()));
}
 
Example 14
Source File: G2SDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException if the PDB residue mappings cannot be downloaded
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();    
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING()); 

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData); 
    dataset.show();
    
    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }   
       
    dataset = standardizeData(dataset);
    
    return flattenDataset(dataset);
}
 
Example 15
Source File: BuildDataFrameFromScratch.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Build a DataFrame from Scratch").master("local[*]")
      .getOrCreate();

  List<String> stringAsList = new ArrayList<>();
  stringAsList.add("bar");

  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((
      String row) -> RowFactory.create(row));

  // Creates schema
  StructType schema = DataTypes.createStructType(
      new StructField[] { DataTypes.createStructField("foe",
          DataTypes.StringType, false) });

  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

  log.debug("** Schema: ");
  df.printSchema();

  log.debug("** Data: ");
  df.show();

  sparkContext.close();
}
 
Example 16
Source File: HoodieJavaStreamingApp.java    From hudi with Apache License 2.0
/**
 * Runs the demo end to end: cleans and recreates the source, checkpoint, and table
 * directories, generates test records, and then runs the structured streaming ingest
 * and the result display concurrently.
 *
 * @throws Exception if the streaming or show threads fail
 */
public void run() throws Exception {
  // Spark session setup..
  SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
  JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());

  // folder path clean up and creation, preparing the environment
  FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
  fs.delete(new Path(streamingSourcePath), true);
  fs.delete(new Path(streamingCheckpointingPath), true);
  fs.delete(new Path(tablePath), true);
  fs.mkdirs(new Path(streamingSourcePath));

  // Generator of some records to be loaded in.
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

  List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001", 100));
  Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

  List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002", 100));

  Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));

  // setup the input for streaming
  Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath);


  // start streaming and showing
  ExecutorService executor = Executors.newFixedThreadPool(2);

  // thread for Spark structured streaming
  Future<Void> streamFuture = executor.submit(() -> {
    LOG.info("===== Streaming Starting =====");
    stream(streamingInput);
    LOG.info("===== Streaming Ends =====");
    return null;
  });

  // thread for adding data to the streaming source and showing results over time
  Future<Void> showFuture = executor.submit(() -> {
    LOG.info("===== Showing Starting =====");
    show(spark, fs, inputDF1, inputDF2);
    LOG.info("===== Showing Ends =====");
    return null;
  });

  // let the threads run
  streamFuture.get();
  showFuture.get();

  executor.shutdown();
}
 
Example 17
Source File: GeoWaveSparkSQLIT.java    From geowave with Apache License 2.0
@Test
public void testCreateDataFrame() throws Exception {
  // Set up Spark
  final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
  final SparkContext context = session.sparkContext();

  // ingest test points
  TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  final SqlQueryRunner queryRunner = new SqlQueryRunner();
  queryRunner.setSparkSession(session);

  try {
    // Load RDD from datastore, no filters
    final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
    final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

    final long count = javaRdd.count();
    LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

    queryRunner.addInputStore(dataStore, null, "features");

    final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

    queryRunner.setSql(
        "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

    Dataset<Row> results = queryRunner.run();
    final long containsCount = results.count();
    LOGGER.warn("Got " + containsCount + " for GeomContains test");

    queryRunner.setSql(
        "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
    results = queryRunner.run();
    final long withinCount = results.count();
    LOGGER.warn("Got " + withinCount + " for GeomWithin test");

    Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

    // Test the output writer
    final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);

    sqlResultsWriter.writeResults("sqltest");

    queryRunner.removeAllStores();

    // Test other spatial UDFs
    final String line1 = "LINESTRING(0 0, 10 10)";
    final String line2 = "LINESTRING(0 10, 10 0)";
    queryRunner.setSql(
        "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
    Row result = queryRunner.run().head();

    final boolean intersect = result.getBoolean(0);
    LOGGER.warn("GeomIntersects returned " + intersect);

    Assert.assertTrue("Lines should intersect", intersect);

    queryRunner.setSql(
        "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
    result = queryRunner.run().head();

    final boolean disjoint = result.getBoolean(0);
    LOGGER.warn("GeomDisjoint returned " + disjoint);

    Assert.assertFalse("Lines should not be disjoint", disjoint);

  } catch (final Exception e) {
    e.printStackTrace();
    TestUtils.deleteAll(dataStore);
    Assert.fail(
        "Error occurred while testing a bounding box query of spatial index: '"
            + e.getLocalizedMessage()
            + "'");
  }

  // Clean up
  TestUtils.deleteAll(dataStore);
}
 
Example 18
Source File: ProcessVendorTrasactions.java    From aws-big-data-blog with Apache License 2.0
public static void run(String jobInputParam) throws Exception{
	
   	List<StructField> schemaFields = new ArrayList<StructField>();
   	schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true));
   	StructType schema = DataTypes.createStructType(schemaFields);

   	SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys");
   	SparkSession spark = SparkSession.builder().config(conf).getOrCreate();	
	JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
			
	String redshiftJDBCURL=props.getProperty("redshift.jdbc.url");
	String s3TempPath = props.getProperty("s3.temp.path");
	System.out.println("props"+props);
	
	JavaRDD<Row> salesRDD = sc.textFile(jobInputParam)
			.map(new Function<String, Row>() {
				public Row call(String saleRec) {
					String[] fields = saleRec.split(",");
					return RowFactory.create(fields[0], fields[1], fields[2], fields[3], fields[4]);
				}
			});
	Dataset<Row> salesDF = spark.createDataFrame(salesRDD,schema);
	Dataset<Row> vendorItemSaleAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("4")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	Dataset<Row> vendorItemTaxAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("5")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	Dataset<Row> vendorItemDiscountAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("6")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	String[] joinColArray = {"vendor_id","item_id","trans_date"};
	vendorItemSaleAmountDF.printSchema();
	Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq();

	Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF.join(vendorItemTaxAmountDF,commonJoinColumns,"left_outer")
							 .join(vendorItemDiscountAmountDF,commonJoinColumns,"left_outer")
							 .toDF("vendor_id","item_id","trans_date","sale_amount","tax_amount","discount_amount");
	
	vendorAggregatedDF.printSchema();
	DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain();
	AWSSessionCredentials creds  = (AWSSessionCredentials) provider.getCredentials();
	
	String appendix=new StringBuilder(String.valueOf(System.currentTimeMillis())).append("_").append(String.valueOf(new Random().nextInt(10)+1)).toString();
	String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp")
			 .append(appendix)
			 .append(" where vendortranssummary.vendor_id=vendortranssummary_temp")
			 .append(appendix)
			 .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp")
			 .append(appendix)
			 .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp")
			 .append(appendix)
			 .append(".trans_date;")
			 .append("insert into vendortranssummary select * from vendortranssummary_temp")
			 .append(appendix)
			 .append(";drop table vendortranssummary_temp")
			 .append(appendix)
			 .append(";end transaction;").toString();
	vendorAggregatedDF.write().format("com.databricks.spark.redshift").option("url", redshiftJDBCURL)
    .option("dbtable", "vendortranssummary_temp"+appendix)
    .option("usestagingtable","false")
    .option("postactions",vendorTransSummarySQL)
    .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId())
    .option("temporary_aws_secret_access_key",creds.getAWSSecretKey())
    .option("temporary_aws_session_token", creds.getSessionToken())
    .option("tempdir", s3TempPath).mode(SaveMode.Overwrite).save();
		
}
 
Example 19
Source File: StateLessProcessingExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws InterruptedException {

		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

		SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("stateless Streaming Example")
				.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

		JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
				Durations.milliseconds(1000));
		JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);

		JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> {
			ObjectMapper mapper = new ObjectMapper();
			return mapper.readValue(x, FlightDetails.class);
		});

		// flightDetailsStream.print();

		// flightDetailsStream.foreachRDD((VoidFunction<JavaRDD<FlightDetails>>) rdd -> rdd.saveAsTextFile("hdfs://namenode:port/path"));

		JavaDStream<FlightDetails> window = flightDetailsStream.window(Durations.minutes(5), Durations.minutes(1));

		JavaPairDStream<String, Double> transfomedWindow = window
				.mapToPair(f -> new Tuple2<String, Double>(f.getFlightId(), f.getTemperature()))
				.mapValues(t -> new Tuple2<Double, Integer>(t, 1))
				.reduceByKey((t1, t2) -> new Tuple2<Double, Integer>(t1._1() + t2._1(), t1._2() + t2._2()))
				.mapValues(t -> t._1() / t._2());
		transfomedWindow.cache();
		transfomedWindow.print();
	    
		jssc.start();
		jssc.awaitTermination();
	}
 
Example 20
Source File: ValueSetUdfs.java    From bunsen with Apache License 2.0
/**
 * Pushes an "in_valueset" UDF that uses the given {@link BroadcastableValueSets} for its content.
 *
 * @param spark the spark session
 * @param valueSets the valuesets to use in the UDF
 */
public static synchronized void pushUdf(SparkSession spark, BroadcastableValueSets valueSets) {

  JavaSparkContext ctx = new JavaSparkContext(spark.sparkContext());

  Broadcast<BroadcastableValueSets> broadcast = ctx.broadcast(valueSets);

  pushUdf(spark, broadcast);
}
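Once pushed, the UDF can be referenced from Spark SQL. A rough sketch of what a query might look like; the table name, valueset reference, and exact UDF arguments are assumptions and should be checked against the Bunsen documentation:

ValueSetUdfs.pushUdf(spark, valueSets);

// hypothetical table and valueset reference
Dataset<Row> matches = spark.sql(
        "SELECT * FROM observation WHERE in_valueset(code, 'hypothetical_valueset')");
matches.show();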