org.apache.spark.sql.SparkSession Java Examples
The following examples show how to use
org.apache.spark.sql.SparkSession.
You can go to the original project or source file by following the links above each example.
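For orientation, here is a minimal, self-contained sketch of the typical SparkSession lifecycle used throughout the examples below. The application name, master URL, and query are placeholder values, not taken from any of the listed projects:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkSessionSketch {
    public static void main(String[] args) {
        // Build a session (or reuse an existing one); "local[*]" and the app name are placeholders.
        SparkSession spark = SparkSession.builder()
            .appName("SparkSessionSketch")
            .master("local[*]")
            .getOrCreate();

        // Use the session, e.g. run a trivial SQL query and print the result.
        Dataset<Row> df = spark.sql("SELECT 1 AS id");
        df.show();

        // Stop the session when the application is done with Spark.
        spark.stop();
    }
}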
Example #1
Source File: MyVariantDataset.java From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset of missense variations for a list of Uniprot Ids and a MyVariant.info query.
 * See <a href="http://myvariant.info/docs/">query syntax</a>.
 * <p> Example:
 * <pre>
 * String query = "clinvar.rcv.clinical_significance:pathogenic "
 *     + "OR clinvar.rcv.clinical_significance:likely pathogenic";
 * </pre>
 *
 * @param uniprotIds list of Uniprot Ids
 * @param query MyVariant.info query string
 * @return dataset with variation Ids and Uniprot Ids or null if no data are found
 * @throws IOException
 */
public static Dataset<Row> getVariations(List<String> uniprotIds, String query) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(uniprotIds).flatMap(m -> getData(m, query));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);

    // return null if dataset contains no results
    if (!Arrays.asList(dataset.columns()).contains("hits")) {
        System.out.println("MyVariantDataset: no matches found");
        return null;
    }

    return flattenDataset(dataset);
}
Example #2
Source File: GraphLoader.java From tutorials with MIT License
public GraphFrame getGraphFrameUserRelationship() throws IOException {
    Path temp = Files.createTempDirectory("sparkGraphFrames");
    SparkSession session = SparkSession.builder()
        .appName("SparkGraphFrameSample")
        .config("spark.sql.warehouse.dir", temp.toString())
        .sparkContext(getSparkContext().sc())
        .master("local[*]")
        .getOrCreate();
    List<User> users = loadUsers();

    Dataset<Row> userDataset = session.createDataFrame(users, User.class);

    List<Relationship> relationshipsList = getRelations();
    Dataset<Row> relationshipDataset = session.createDataFrame(relationshipsList, Relationship.class);

    GraphFrame graphFrame = new GraphFrame(userDataset, relationshipDataset);
    return graphFrame;
}
Example #3
Source File: TestSuite.java From stocator with Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
        throws Exception {
    System.out.println("*********************************");
    System.out.println("T16: Non overwrite mode " + containerOut);
    String o1 = containerOut + "myData/123";
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("NAME", DataTypes.StringType, false),
        DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
        DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
    Row r1 = RowFactory.create("name1", "value1", 1);
    Row r2 = RowFactory.create("name2", "value2", 2);
    List<Row> rowList = ImmutableList.of(r1, r2);
    Dataset<Row> rows = spark.createDataFrame(rowList, schema);
    try {
        if (type.equals(Constants.PARQUET_TYPE)) {
            rows.write().mode(SaveMode.Overwrite).parquet(o1);
        } else if (type.equals(Constants.JSON_TYPE)) {
            rows.write().mode(SaveMode.Overwrite).json(o1);
        }
    } catch (Exception e) {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    }
}
Example #4
Source File: global.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
        .appName("Java Spark SQL")
        .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("URL");

    try {
        // Create a global temporary view.
        dataset.createGlobalTempView("user");
        // Global temporary views are bound to the system-preserved database "global_temp".
        Dataset<Row> globalUser = sparkSession.sql("SELECT * FROM global_temp.user");
        sparkSession.newSession().sql("SELECT * FROM global_temp.user");
    } catch (AnalysisException e) {
        e.printStackTrace();
    }
}
Example #5
Source File: TestSuite.java From stocator with Apache License 2.0
public void test14(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
        throws Exception {
    System.out.println("*********************************");
    System.out.println("T14: Append mode " + containerOut);
    String o1 = containerOut + "myData";
    try {
        createAppendObject("T14 - first append", schemaFlights, o1, type);
        long baseCount = schemaFlights.count();
        System.out.println("***T14-1 : Reading " + o1 + " from " + containerOut
            + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-1-" + type, type, o1, spark, baseCount, 1);
        createAppendObject("T14 - second append", schemaFlights, o1, type);
        baseCount = schemaFlights.count();
        System.out.println("***T14-2 : Reading " + o1 + " from " + containerOut
            + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-2-" + type, type, o1, spark, baseCount, 2);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example #6
Source File: AutomatedTestBase.java From systemds with Apache License 2.0
/**
 * Create a SystemDS-preferred Spark Session.
 *
 * @param appName the application name
 * @param master the master value (ie, "local", etc)
 * @return Spark Session
 */
public static SparkSession createSystemDSSparkSession(String appName, String master) {
    Builder builder = SparkSession.builder();
    if (appName != null) {
        builder.appName(appName);
    }
    if (master != null) {
        builder.master(master);
    }
    builder.config("spark.driver.maxResultSize", "0");
    if (SparkExecutionContext.FAIR_SCHEDULER_MODE) {
        builder.config("spark.scheduler.mode", "FAIR");
    }
    builder.config("spark.locality.wait", "5s");
    SparkSession spark = builder.getOrCreate();
    return spark;
}
Example #7
Source File: BooksCsvToDataset.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("Book CSV to Dataset")
        .master("local").getOrCreate();

    String filename = "data/books.csv";
    // @formatter:off
    Dataset<Row> df = spark
        .read()
        .format("csv")
        .option("inferSchema", "false") // We are not inferring the schema for now
        .option("header", "true")
        .load(filename);
    // @formatter:on
    df.show();

    // In this case everything is a string
    df.printSchema();
}
Example #8
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
public static void main(String[] args) throws AnalysisException {
    // $example on:init_session$
    SparkSession spark = SparkSession
        .builder()
        .appName("Java Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate();
    // $example off:init_session$

    runBasicDataFrameExample(spark);
    runDatasetCreationExample(spark);
    runInferSchemaExample(spark);
    runProgrammaticSchemaExample(spark);

    spark.stop();
}
Example #9
Source File: RDDConverterUtilsExtTest.java From systemds with Apache License 2.0
@Test
public void testStringDataFrameToVectorDataFrameNull() {
    List<String> list = new ArrayList<>();
    list.add("[1.2, 3.4]");
    list.add(null);
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
    SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
    Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);

    List<String> expectedResults = new ArrayList<>();
    expectedResults.add("[[1.2,3.4]]");
    expectedResults.add("[null]");

    List<Row> outputList = outDF.collectAsList();
    for (Row row : outputList) {
        assertTrue("Expected results don't contain: " + row, expectedResults.contains(row.toString()));
    }
}
Example #10
Source File: JavaIgniteDataFrameExample.java From ignite with Apache License 2.0
/** */
private static void nativeSparkSqlExample(SparkSession spark) {
    System.out.println("Querying using Spark SQL.");

    Dataset<Row> df = spark.read()
        .format(IgniteDataFrameSettings.FORMAT_IGNITE())              // Data source type.
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")     // Table to read.
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG) // Ignite config.
        .load();

    // Registering DataFrame as Spark view.
    df.createOrReplaceTempView("person");

    // Selecting data from Ignite through Spark SQL Engine.
    Dataset<Row> igniteDF = spark.sql("SELECT * FROM person WHERE id >= 2 AND name = 'Mary Major'");

    System.out.println("Result schema:");
    igniteDF.printSchema(); // Printing query schema to console.

    System.out.println("Result content:");
    igniteDF.show(); // Printing query results to console.
}
Example #11
Source File: BasicExternalUdfFromTextFile.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
        .master("local").getOrCreate();
    spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType);

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
        .option("header", "false").load(filename);
    df = df.withColumn("label", df.col("_c0")).drop("_c0");
    df = df.withColumn("value", df.col("_c1")).drop("_c1");
    df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
    df.show();
}
Example #12
Source File: DefaultMetricCollector.java From ExecDashboard with Apache License 2.0
private List<String> getCollectorItemListForLobs(List<Lob> lobList, SparkSession sparkSession,
        JavaSparkContext javaSparkContext) {
    dashboardCollectorItemsMap = DashBoardCollectorItemMapBuilder.getDashboardNameCollectorItemsMapById(
        getCollectorType(), sparkSession, javaSparkContext);

    List<String> collectorItemList = new ArrayList<>();
    Optional.ofNullable(lobList).orElseGet(Collections::emptyList).stream()
        .map(Lob::getProducts)
        .forEach(products -> products.stream()
            .map(Product::getProductComponentList)
            .forEach(productComponents -> productComponents
                .stream()
                .map(ProductComponent::getProductComponentDashboardId)
                .filter(Objects::nonNull)
                .<List<String>>map(dashboardId ->
                    dashboardCollectorItemsMap.get(dashboardId.toString()) != null
                        ? dashboardCollectorItemsMap.get(dashboardId.toString())
                        : new ArrayList<>())
                .forEach(collectorItemList::addAll)));
    return collectorItemList;
}
Example #13
Source File: JavaShakespeare.java From spark-bigquery-connector with Apache License 2.0
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("spark-bigquery-demo")
        .getOrCreate();

    // Use the Cloud Storage bucket for temporary BigQuery export data used
    // by the connector. This assumes the Cloud Storage connector for
    // Hadoop is configured.
    String bucket = spark.sparkContext().hadoopConfiguration().get("fs.gs.system.bucket");
    spark.conf().set("temporaryGcsBucket", bucket);

    // Load data in from BigQuery.
    Dataset<Row> wordsDF = spark.read().format("bigquery")
        .option("table", "bigquery-public-data.samples.shakespeare").load().cache();
    wordsDF.show();
    wordsDF.printSchema();
    wordsDF.createOrReplaceTempView("words");

    // Perform word count.
    Dataset<Row> wordCountDF = spark.sql(
        "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word");

    // Saving the data to BigQuery
    wordCountDF.write().format("bigquery").option("table", "wordcount_dataset.wordcount_output")
        .save();
}
Example #14
Source File: CustomReportService.java From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset with the specified columns for all current PDB entries.
 * See <a href="https://www.rcsb.org/pdb/results/reportField.do"> for list
 * of supported field names</a>
 *
 * @param columnNames
 *            names of the columns for the dataset
 * @return dataset with the specified columns
 * @throws IOException
 *             when temporary csv file cannot be created
 */
public static Dataset<Row> getDataset(String... columnNames) throws IOException {
    // form query URL
    String query = CURRENT_URL + columNamesString(columnNames);

    // run tabular report query
    InputStream input = postQuery(query);

    // save as a temporary CSV file
    Path tempFile = saveTempFile(input);

    SparkSession spark = SparkSession.builder().getOrCreate();

    // load temporary CSV file into Spark dataset
    Dataset<Row> dataset = readCsv(spark, tempFile.toString());

    return concatIds(spark, dataset, columnNames);
}
Example #15
Source File: ExternalTableIT.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testParquetColumnName() throws Exception {
    String tablePath = getExternalResourceDirectory() + "parquet_colname";
    methodWatcher.execute(String.format("create external table t_parquet (col1 int, col2 varchar(5))"
        + " STORED AS PARQUET LOCATION '%s'", tablePath));
    methodWatcher.execute("insert into t_parquet values (1, 'A')");

    SparkSession spark = SparkSession.builder()
        .master("local")
        .appName("ExternaltableIT")
        .getOrCreate();
    Dataset dataset = spark
        .read()
        .parquet(tablePath);
    String actual = dataset.schema().toString();
    String expected = "StructType(StructField(COL1,IntegerType,true), StructField(COL2,StringType,true))";
    Assert.assertEquals(actual, expected, actual);
}
Example #16
Source File: JavaWord2VecExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaWord2VecExample")
        .getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
        RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
        RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    Dataset<Row> documentDF = spark.createDataFrame(data, schema);

    // Learn a mapping from words to Vectors.
    Word2Vec word2Vec = new Word2Vec()
        .setInputCol("text")
        .setOutputCol("result")
        .setVectorSize(3)
        .setMinCount(0);

    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);

    for (Row row : result.collectAsList()) {
        List<String> text = row.getList(0);
        Vector vector = (Vector) row.get(1);
        System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
    }
    // $example off$

    spark.stop();
}
Example #17
Source File: SparkFactoryImpl.java From beakerx with Apache License 2.0
private Optional<SparkSessionBuilder> getBuilderFromUser(Object result) {
    if (result instanceof SparkConf) {
        return of(sparkSessionBuilderFactory.newInstance((SparkConf) result));
    } else if (result instanceof SparkSession.Builder) {
        return of(sparkSessionBuilderFactory.newInstance((SparkSession.Builder) result));
    } else {
        return Optional.empty();
    }
}
Example #18
Source File: MLContextUtil.java From systemds with Apache License 2.0
/**
 * Check that the Spark version is supported. If it isn't supported, throw
 * an MLContextException.
 *
 * @param spark
 *            SparkSession
 * @throws MLContextException
 *             thrown if Spark version isn't supported
 */
public static void verifySparkVersionSupported(SparkSession spark) {
    String minimumRecommendedSparkVersion = null;
    try {
        // If this is being called using the SystemDS jar file,
        // ProjectInfo should be available.
        ProjectInfo projectInfo = ProjectInfo.getProjectInfo();
        minimumRecommendedSparkVersion = projectInfo.minimumRecommendedSparkVersion();
    } catch (MLContextException e) {
        try {
            // During development (such as in an IDE), there is no jar file typically built,
            // so attempt to obtain the minimum recommended Spark version from the pom.xml file
            minimumRecommendedSparkVersion = getMinimumRecommendedSparkVersionFromPom();
        } catch (MLContextException e1) {
            throw new MLContextException(
                "Minimum recommended Spark version could not be determined from SystemDS jar file manifest or pom.xml");
        }
    }
    String sparkVersion = spark.version();
    if (!MLContextUtil.isSparkVersionSupported(sparkVersion, minimumRecommendedSparkVersion)) {
        throw new MLContextException(
            "Spark " + sparkVersion + " or greater is recommended for this version of SystemDS.");
    }
}
Example #19
Source File: DataPreview.java From StockPrediction with MIT License
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";

    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
        .load(new ClassPathResource(filename).getFile().getAbsolutePath())
        //.filter(functions.col("symbol").equalTo(symbol))
        //.drop("date").drop("symbol")
        .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
        .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
        .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
        .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
        .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
        .toDF("date", "symbol", "open", "close", "low", "high", "volume");

    data.show();

    Dataset<Row> symbols = data.select("date", "symbol").groupBy("symbol")
        .agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
        .setOutputCol("features");

    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");

    data = new MinMaxScaler().setMin(0).setMax(1)
        .setInputCol("features").setOutputCol("normalizedFeatures")
        .fit(data).transform(data)
        .drop("features").toDF("features");
}
Example #20
Source File: MLContextMultipleScriptsTest.java From systemds with Apache License 2.0
private static void runMLContextTestMultipleScript(ExecMode platform, boolean wRead) {
    ExecMode oldplatform = DMLScript.getGlobalExecMode();
    DMLScript.setGlobalExecMode(platform);

    //create mlcontext
    SparkSession spark = createSystemDSSparkSession("MLContextMultipleScriptsTest", "local");
    MLContext ml = new MLContext(spark);
    ml.setExplain(true);

    String dml1 = baseDirectory + File.separator + "MultiScript1.dml";
    String dml2 = baseDirectory + File.separator + (wRead ? "MultiScript2b.dml" : "MultiScript2.dml");
    String dml3 = baseDirectory + File.separator + (wRead ? "MultiScript3b.dml" : "MultiScript3.dml");

    try {
        //run script 1
        Script script1 = dmlFromFile(dml1).in("$rows", rows).in("$cols", cols).out("X");
        Matrix X = ml.execute(script1).getMatrix("X");

        Script script2 = dmlFromFile(dml2).in("X", X).out("Y");
        Matrix Y = ml.execute(script2).getMatrix("Y");

        Script script3 = dmlFromFile(dml3).in("X", X).in("Y", Y).out("z");
        String z = ml.execute(script3).getString("z");

        System.out.println(z);
    } finally {
        DMLScript.setGlobalExecMode(oldplatform);
        // stop underlying spark context to allow single jvm tests (otherwise the
        // next test that tries to create a SparkContext would fail)
        spark.stop();
        // clear status mlcontext and spark exec context
        ml.close();
    }
}
Example #21
Source File: JavaIgniteDataFrameWriteExample.java From ignite with Apache License 2.0
/** */
private static void editDataAndSaveToNewTable(Ignite ignite, SparkSession spark) {
    //Load content of Ignite table to data frame.
    Dataset<Row> personDataFrame = spark.read()
        .format(IgniteDataFrameSettings.FORMAT_IGNITE())
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")
        .load();

    System.out.println("Data frame content:");

    //Printing content of data frame to console.
    personDataFrame.show();

    System.out.println("Modifying Data Frame and write it to Ignite:");

    personDataFrame
        .withColumn("id", col("id").plus(42))     //Edit id column
        .withColumn("name", reverse(col("name"))) //Edit name column
        .write().format(IgniteDataFrameSettings.FORMAT_IGNITE())
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "new_persons")
        .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS(), "id, city_id")
        .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PARAMETERS(), "backups=1")
        .mode(SaveMode.Overwrite) //Overwriting entire table.
        .save();

    System.out.println("Done!");

    System.out.println("Reading data from Ignite table:");

    CacheConfiguration<?, ?> ccfg = new CacheConfiguration<>(CACHE_NAME);
    IgniteCache<?, ?> cache = ignite.getOrCreateCache(ccfg);

    //Reading saved data from Ignite.
    List<List<?>> data = cache.query(new SqlFieldsQuery("SELECT id, name, city_id FROM new_persons")).getAll();

    System.out.println(data);
}
Example #22
Source File: JavaTC.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTC")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    Integer slices = (args.length > 0) ? Integer.parseInt(args[0]) : 2;
    JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
        new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<>(e._2(), e._1());
            }
        });

    long oldCount;
    long nextCount = tc.count();
    do {
        oldCount = nextCount;
        // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        // then project the result to obtain the new (x, z) paths.
        tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
        nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    spark.stop();
}
Example #23
Source File: DrugBankDataset.java From mmtf-spark with Apache License 2.0
/**
 * Reads a CSV file into a Spark dataset.
 *
 * @param inputFileName
 * @throws IOException
 */
private static Dataset<Row> readCsv(String inputFileName) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();

    Dataset<Row> dataset = spark.read().format("csv").option("header", "true").option("inferSchema", "true")
        .load(inputFileName);

    return dataset;
}
Example #24
Source File: DefaultMetricCollector.java From ExecDashboard with Apache License 2.0
public void collect(SparkSession sparkSession, JavaSparkContext javaSparkContext, List<?> objectList) {
    if ((sparkSession == null) || (javaSparkContext == null) || CollectionUtils.isEmpty(objectList)) {
        return;
    }

    if (objectList.get(0) instanceof Portfolio) {
        collectPortFolioMetrics(sparkSession, javaSparkContext, (List<Portfolio>) objectList);
        return;
    }

    if (objectList.get(0) instanceof Lob) {
        collectLobMetrics(sparkSession, javaSparkContext, (List<Lob>) objectList);
        return;
    }
}
Example #25
Source File: TextFileToDataset2.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Dataset from Text File")
        .master("local[*]")
        .getOrCreate();

    String filename = "data/simple-data-file.txt";
    Dataset<Row> df = spark.read().text(filename);
    df.show();
}
Example #26
Source File: DefaultDataCollector.java From ExecDashboard with Apache License 2.0
DefaultDataCollector(String collectionName, String query, List<String> collectorItemIds,
        SparkSession sparkSession, JavaSparkContext javaSparkContext,
        PortfolioCollectorSetting portfolioCollectorSetting) {
    this.collectionName = collectionName;
    this.query = query;
    this.collectorItemIds = collectorItemIds;
    this.sparkSession = sparkSession;
    this.javaSparkContext = javaSparkContext;
    this.portfolioCollectorSetting = portfolioCollectorSetting;
}
Example #27
Source File: JavaLDAExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    // Creates a SparkSession
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaLDAExample")
        .getOrCreate();

    // $example on$
    // Loads data.
    Dataset<Row> dataset = spark.read().format("libsvm")
        .load("data/mllib/sample_lda_libsvm_data.txt");

    // Trains an LDA model.
    LDA lda = new LDA().setK(10).setMaxIter(10);
    LDAModel model = lda.fit(dataset);

    double ll = model.logLikelihood(dataset);
    double lp = model.logPerplexity(dataset);
    System.out.println("The lower bound on the log likelihood of the entire corpus: " + ll);
    System.out.println("The upper bound on perplexity: " + lp);

    // Describe topics.
    Dataset<Row> topics = model.describeTopics(3);
    System.out.println("The topics described by their top-weighted terms:");
    topics.show(false);

    // Shows the result.
    Dataset<Row> transformed = model.transform(dataset);
    transformed.show(false);
    // $example off$

    spark.stop();
}
Example #28
Source File: PdbMetadataDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local[*]")
        .appName(PdbMetadataDemo.class.getSimpleName())
        .getOrCreate();

    // query the following fields from the _citation category using PDBj's Mine2 web service:
    // journal_abbrev, pdbx_database_id_PubMed, year.
    // Note, mixed case column names must be quoted and escaped with \".
    String sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'";
    Dataset<Row> ds = PdbjMineDataset.getDataset(sqlQuery);

    System.out.println("First 10 results from query: " + sqlQuery);
    ds.show(10, false);

    // filter out unpublished entries (they contain the word "published" in various upper/lower case combinations)
    ds = ds.filter("UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'");

    // print the top 10 journals
    System.out.println("Top 10 journals that publish PDB structures:");
    ds.groupBy("journal_abbrev").count().sort(col("count").desc()).show(10, false);

    // filter out entries without a PubMed Id (is -1 if PubMed Id is not available)
    ds = ds.filter("pdbx_database_id_PubMed > 0");
    System.out.println("Entries with PubMed Ids: " + ds.count());

    // show growth of papers in PubMed
    System.out.println("PubMed Ids per year: ");
    ds.groupBy("year").count().sort(col("year").desc()).show(10, false);

    spark.close();
}
Example #29
Source File: Bundles.java From bunsen with Apache License 2.0
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceClass the type of resource to extract.
 * @return a dataset of the given resource
 */
public Dataset<Row> extractEntry(SparkSession spark, JavaRDD<BundleContainer> bundles,
        Class resourceClass) {

    RuntimeResourceDefinition definition = FhirContexts.contextFor(fhirVersion)
        .getResourceDefinition(resourceClass);

    return extractEntry(spark, bundles, definition.getName());
}
Example #30
Source File: ConceptMaps.java From bunsen with Apache License 2.0
/**
 * Returns an empty ConceptMaps instance.
 *
 * @param spark the spark session
 * @return an empty ConceptMaps instance.
 */
public static ConceptMaps getEmpty(SparkSession spark) {
    Dataset<ConceptMap> emptyConceptMaps = spark.emptyDataset(CONCEPT_MAP_ENCODER)
        .withColumn("timestamp", lit(null).cast("timestamp"))
        .as(CONCEPT_MAP_ENCODER);

    return new ConceptMaps(spark,
        spark.emptyDataset(URL_AND_VERSION_ENCODER),
        emptyConceptMaps,
        spark.emptyDataset(MAPPING_ENCODER));
}