org.apache.spark.sql.SparkSession Java Examples
The following examples show how to use
org.apache.spark.sql.SparkSession.
You can go to the original project or source file by following the links above each example.
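For orientation, here is a minimal, self-contained sketch of the typical SparkSession lifecycle used throughout the examples below. The application name, master URL, and query are placeholder values, not taken from any of the listed projects:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkSessionSketch {
    public static void main(String[] args) {
        // Build a session (or reuse an existing one); "local[*]" and the app name are placeholders.
        SparkSession spark = SparkSession.builder()
            .appName("SparkSessionSketch")
            .master("local[*]")
            .getOrCreate();

        // Use the session, e.g. run a trivial SQL query and print the result.
        Dataset<Row> df = spark.sql("SELECT 1 AS id");
        df.show();

        // Stop the session when the application is done with Spark.
        spark.stop();
    }
}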
Example #1
Source File: MyVariantDataset.java From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset of missense variations for a list of Uniprot Ids and a MyVariant.info query.
 * See <a href="http://myvariant.info/docs/">query syntax</a>.
 * <p> Example:
 * <pre>
 * String query = "clinvar.rcv.clinical_significance:pathogenic "
 *     + "OR clinvar.rcv.clinical_significance:likely pathogenic";
 * </pre>
 *
 * @param uniprotIds list of Uniprot Ids
 * @param query MyVariant.info query string
 * @return dataset with variation Ids and Uniprot Ids or null if no data are found
 * @throws IOException
 */
public static Dataset<Row> getVariations(List<String> uniprotIds, String query) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(uniprotIds).flatMap(m -> getData(m, query));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);

    // return null if dataset contains no results
    if (!Arrays.asList(dataset.columns()).contains("hits")) {
        System.out.println("MyVariantDataset: no matches found");
        return null;
    }

    return flattenDataset(dataset);
}
Example #2
Source File: GraphLoader.java From tutorials with MIT License
public GraphFrame getGraphFrameUserRelationship() throws IOException {
    Path temp = Files.createTempDirectory("sparkGraphFrames");
    SparkSession session = SparkSession.builder()
        .appName("SparkGraphFrameSample")
        .config("spark.sql.warehouse.dir", temp.toString())
        .sparkContext(getSparkContext().sc())
        .master("local[*]")
        .getOrCreate();
    List<User> users = loadUsers();

    Dataset<Row> userDataset = session.createDataFrame(users, User.class);

    List<Relationship> relationshipsList = getRelations();
    Dataset<Row> relationshipDataset = session.createDataFrame(relationshipsList, Relationship.class);

    GraphFrame graphFrame = new GraphFrame(userDataset, relationshipDataset);
    return graphFrame;
}
Example #3
Source File: TestSuite.java From stocator with Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
        throws Exception {
    System.out.println("*********************************");
    System.out.println("T16: Non overwrite mode " + containerOut);
    String o1 = containerOut + "myData/123";
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("NAME", DataTypes.StringType, false),
        DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
        DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
    Row r1 = RowFactory.create("name1", "value1", 1);
    Row r2 = RowFactory.create("name2", "value2", 2);
    List<Row> rowList = ImmutableList.of(r1, r2);
    Dataset<Row> rows = spark.createDataFrame(rowList, schema);
    try {
        if (type.equals(Constants.PARQUET_TYPE)) {
            rows.write().mode(SaveMode.Overwrite).parquet(o1);
        } else if (type.equals(Constants.JSON_TYPE)) {
            rows.write().mode(SaveMode.Overwrite).json(o1);
        }
    } catch (Exception e) {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    }
}
Example #4
Source File: global.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
        .appName("Java Spark SQL")
        .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("URL");

    try {
        // Create a global temporary view.
        dataset.createGlobalTempView("user");
        // Global temporary views are bound to the system-preserved database "global_temp".
        Dataset<Row> globalUser = sparkSession.sql("SELECT * FROM global_temp.user");
        sparkSession.newSession().sql("SELECT * FROM global_temp.user");
    } catch (AnalysisException e) {
        e.printStackTrace();
    }
}
Example #5
Source File: TestSuite.java From stocator with Apache License 2.0
public void test14(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
        throws Exception {
    System.out.println("*********************************");
    System.out.println("T14: Append mode " + containerOut);
    String o1 = containerOut + "myData";
    try {
        createAppendObject("T14 - first append", schemaFlights, o1, type);
        long baseCount = schemaFlights.count();
        System.out.println("***T14-1 : Reading " + o1 + " from " + containerOut
            + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-1-" + type, type, o1, spark, baseCount, 1);
        createAppendObject("T14 - second append", schemaFlights, o1, type);
        baseCount = schemaFlights.count();
        System.out.println("***T14-2 : Reading " + o1 + " from " + containerOut
            + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-2-" + type, type, o1, spark, baseCount, 2);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example #6
Source File: AutomatedTestBase.java From systemds with Apache License 2.0
/**
 * Create a SystemDS-preferred Spark Session.
 *
 * @param appName the application name
 * @param master the master value (ie, "local", etc)
 * @return Spark Session
 */
public static SparkSession createSystemDSSparkSession(String appName, String master) {
    Builder builder = SparkSession.builder();
    if (appName != null) {
        builder.appName(appName);
    }
    if (master != null) {
        builder.master(master);
    }
    builder.config("spark.driver.maxResultSize", "0");
    if (SparkExecutionContext.FAIR_SCHEDULER_MODE) {
        builder.config("spark.scheduler.mode", "FAIR");
    }
    builder.config("spark.locality.wait", "5s");
    SparkSession spark = builder.getOrCreate();
    return spark;
}
Example #7
Source File: BooksCsvToDataset.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("Book CSV to Dataset")
        .master("local").getOrCreate();

    String filename = "data/books.csv";
    // @formatter:off
    Dataset<Row> df = spark
        .read()
        .format("csv")
        .option("inferSchema", "false") // We are not inferring the schema for now
        .option("header", "true")
        .load(filename);
    // @formatter:on
    df.show();

    // In this case everything is a string
    df.printSchema();
}
Example #8
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
public static void main(String[] args) throws AnalysisException {
    // $example on:init_session$
    SparkSession spark = SparkSession
        .builder()
        .appName("Java Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate();
    // $example off:init_session$

    runBasicDataFrameExample(spark);
    runDatasetCreationExample(spark);
    runInferSchemaExample(spark);
    runProgrammaticSchemaExample(spark);

    spark.stop();
}
Example #9
Source File: RDDConverterUtilsExtTest.java From systemds with Apache License 2.0
@Test
public void testStringDataFrameToVectorDataFrameNull() {
    List<String> list = new ArrayList<>();
    list.add("[1.2, 3.4]");
    list.add(null);
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
    SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
    Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);

    List<String> expectedResults = new ArrayList<>();
    expectedResults.add("[[1.2,3.4]]");
    expectedResults.add("[null]");

    List<Row> outputList = outDF.collectAsList();
    for (Row row : outputList) {
        assertTrue("Expected results don't contain: " + row, expectedResults.contains(row.toString()));
    }
}
Example #10
Source File: JavaIgniteDataFrameExample.java From ignite with Apache License 2.0
/** */
private static void nativeSparkSqlExample(SparkSession spark) {
    System.out.println("Querying using Spark SQL.");

    Dataset<Row> df = spark.read()
        .format(IgniteDataFrameSettings.FORMAT_IGNITE())              // Data source type.
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")     // Table to read.
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG) // Ignite config.
        .load();

    // Registering DataFrame as Spark view.
    df.createOrReplaceTempView("person");

    // Selecting data from Ignite through Spark SQL Engine.
    Dataset<Row> igniteDF = spark.sql("SELECT * FROM person WHERE id >= 2 AND name = 'Mary Major'");

    System.out.println("Result schema:");
    igniteDF.printSchema(); // Printing query schema to console.

    System.out.println("Result content:");
    igniteDF.show(); // Printing query results to console.
}
Example #11
Source File: BasicExternalUdfFromTextFile.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
        .master("local").getOrCreate();
    spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType);

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
        .option("header", "false").load(filename);
    df = df.withColumn("label", df.col("_c0")).drop("_c0");
    df = df.withColumn("value", df.col("_c1")).drop("_c1");
    df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
    df.show();
}
Example #12
Source File: DefaultMetricCollector.java From ExecDashboard with Apache License 2.0
private List<String> getCollectorItemListForLobs(List<Lob> lobList, SparkSession sparkSession,
        JavaSparkContext javaSparkContext) {
    dashboardCollectorItemsMap = DashBoardCollectorItemMapBuilder.getDashboardNameCollectorItemsMapById(
        getCollectorType(), sparkSession, javaSparkContext);

    List<String> collectorItemList = new ArrayList<>();
    Optional.ofNullable(lobList).orElseGet(Collections::emptyList).stream()
        .map(Lob::getProducts)
        .forEach(products -> products.stream()
            .map(Product::getProductComponentList)
            .forEach(productComponents -> productComponents
                .stream()
                .map(ProductComponent::getProductComponentDashboardId)
                .filter(Objects::nonNull)
                .<List<String>>map(dashboardId ->
                    dashboardCollectorItemsMap.get(dashboardId.toString()) != null
                        ? dashboardCollectorItemsMap.get(dashboardId.toString())
                        : new ArrayList<>())
                .forEach(collectorItemList::addAll)));
    return collectorItemList;
}
Example #13
Source File: JavaShakespeare.java From spark-bigquery-connector with Apache License 2.0
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("spark-bigquery-demo")
        .getOrCreate();

    // Use the Cloud Storage bucket for temporary BigQuery export data used
    // by the connector. This assumes the Cloud Storage connector for
    // Hadoop is configured.
    String bucket = spark.sparkContext().hadoopConfiguration().get("fs.gs.system.bucket");
    spark.conf().set("temporaryGcsBucket", bucket);

    // Load data in from BigQuery.
    Dataset<Row> wordsDF = spark.read().format("bigquery")
        .option("table", "bigquery-public-data.samples.shakespeare").load().cache();
    wordsDF.show();
    wordsDF.printSchema();
    wordsDF.createOrReplaceTempView("words");

    // Perform word count.
    Dataset<Row> wordCountDF = spark.sql(
        "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word");

    // Saving the data to BigQuery
    wordCountDF.write().format("bigquery").option("table", "wordcount_dataset.wordcount_output")
        .save();
}
Example #14
Source File: CustomReportService.java From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset with the specified columns for all current PDB entries.
 * See <a href="https://www.rcsb.org/pdb/results/reportField.do"> for list
 * of supported field names</a>
 *
 * @param columnNames
 *            names of the columns for the dataset
 * @return dataset with the specified columns
 * @throws IOException
 *             when temporary csv file cannot be created
 */
public static Dataset<Row> getDataset(String... columnNames) throws IOException {
    // form query URL
    String query = CURRENT_URL + columNamesString(columnNames);

    // run tabular report query
    InputStream input = postQuery(query);

    // save as a temporary CSV file
    Path tempFile = saveTempFile(input);

    SparkSession spark = SparkSession.builder().getOrCreate();

    // load temporary CSV file into Spark dataset
    Dataset<Row> dataset = readCsv(spark, tempFile.toString());

    return concatIds(spark, dataset, columnNames);
}
Example #15
Source File: ExternalTableIT.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testParquetColumnName() throws Exception {
    String tablePath = getExternalResourceDirectory() + "parquet_colname";
    methodWatcher.execute(String.format("create external table t_parquet (col1 int, col2 varchar(5))"
        + " STORED AS PARQUET LOCATION '%s'", tablePath));
    methodWatcher.execute("insert into t_parquet values (1, 'A')");

    SparkSession spark = SparkSession.builder()
        .master("local")
        .appName("ExternaltableIT")
        .getOrCreate();
    Dataset dataset = spark
        .read()
        .parquet(tablePath);
    String actual = dataset.schema().toString();
    String expected = "StructType(StructField(COL1,IntegerType,true), StructField(COL2,StringType,true))";
    Assert.assertEquals(actual, expected, actual);
}
Example #16
Source File: JavaWord2VecExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaWord2VecExample")
        .getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
        RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
        RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    Dataset<Row> documentDF = spark.createDataFrame(data, schema);

    // Learn a mapping from words to Vectors.
    Word2Vec word2Vec = new Word2Vec()
        .setInputCol("text")
        .setOutputCol("result")
        .setVectorSize(3)
        .setMinCount(0);

    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);

    for (Row row : result.collectAsList()) {
        List<String> text = row.getList(0);
        Vector vector = (Vector) row.get(1);
        System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
    }
    // $example off$

    spark.stop();
}
Example #17
Source File: SparkFactoryImpl.java From beakerx with Apache License 2.0
private Optional<SparkSessionBuilder> getBuilderFromUser(Object result) {
    if (result instanceof SparkConf) {
        return of(sparkSessionBuilderFactory.newInstance((SparkConf) result));
    } else if (result instanceof SparkSession.Builder) {
        return of(sparkSessionBuilderFactory.newInstance((SparkSession.Builder) result));
    } else {
        return Optional.empty();
    }
}
Example #18
Source File: MLContextUtil.java From systemds with Apache License 2.0
/**
 * Check that the Spark version is supported. If it isn't supported, throw
 * an MLContextException.
 *
 * @param spark
 *            SparkSession
 * @throws MLContextException
 *             thrown if Spark version isn't supported
 */
public static void verifySparkVersionSupported(SparkSession spark) {
    String minimumRecommendedSparkVersion = null;
    try {
        // If this is being called using the SystemDS jar file,
        // ProjectInfo should be available.
        ProjectInfo projectInfo = ProjectInfo.getProjectInfo();
        minimumRecommendedSparkVersion = projectInfo.minimumRecommendedSparkVersion();
    } catch (MLContextException e) {
        try {
            // During development (such as in an IDE), there is no jar file typically built,
            // so attempt to obtain the minimum recommended Spark version from the pom.xml file
            minimumRecommendedSparkVersion = getMinimumRecommendedSparkVersionFromPom();
        } catch (MLContextException e1) {
            throw new MLContextException(
                "Minimum recommended Spark version could not be determined from SystemDS jar file manifest or pom.xml");
        }
    }
    String sparkVersion = spark.version();
    if (!MLContextUtil.isSparkVersionSupported(sparkVersion, minimumRecommendedSparkVersion)) {
        throw new MLContextException(
            "Spark " + sparkVersion + " or greater is recommended for this version of SystemDS.");
    }
}
Example #19
Source File: DataPreview.java From StockPrediction with MIT License
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";

    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
        .load(new ClassPathResource(filename).getFile().getAbsolutePath())
        //.filter(functions.col("symbol").equalTo(symbol))
        //.drop("date").drop("symbol")
        .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
        .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
        .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
        .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
        .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
        .toDF("date", "symbol", "open", "close", "low", "high", "volume");

    data.show();

    Dataset<Row> symbols = data.select("date", "symbol").groupBy("symbol")
        .agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
        .setOutputCol("features");

    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");

    data = new MinMaxScaler().setMin(0).setMax(1)
        .setInputCol("features").setOutputCol("normalizedFeatures")
        .fit(data).transform(data)
        .drop("features").toDF("features");
}
Example #20
Source File: MLContextMultipleScriptsTest.java From systemds with Apache License 2.0
private static void runMLContextTestMultipleScript(ExecMode platform, boolean wRead) {
    ExecMode oldplatform = DMLScript.getGlobalExecMode();
    DMLScript.setGlobalExecMode(platform);

    //create mlcontext
    SparkSession spark = createSystemDSSparkSession("MLContextMultipleScriptsTest", "local");
    MLContext ml = new MLContext(spark);
    ml.setExplain(true);

    String dml1 = baseDirectory + File.separator + "MultiScript1.dml";
    String dml2 = baseDirectory + File.separator + (wRead ? "MultiScript2b.dml" : "MultiScript2.dml");
    String dml3 = baseDirectory + File.separator + (wRead ? "MultiScript3b.dml" : "MultiScript3.dml");

    try {
        //run script 1
        Script script1 = dmlFromFile(dml1).in("$rows", rows).in("$cols", cols).out("X");
        Matrix X = ml.execute(script1).getMatrix("X");

        Script script2 = dmlFromFile(dml2).in("X", X).out("Y");
        Matrix Y = ml.execute(script2).getMatrix("Y");

        Script script3 = dmlFromFile(dml3).in("X", X).in("Y", Y).out("z");
        String z = ml.execute(script3).getString("z");

        System.out.println(z);
    } finally {
        DMLScript.setGlobalExecMode(oldplatform);
        // stop underlying spark context to allow single jvm tests (otherwise the
        // next test that tries to create a SparkContext would fail)
        spark.stop();
        // clear status mlcontext and spark exec context
        ml.close();
    }
}
Example #21
Source File: JavaIgniteDataFrameWriteExample.java From ignite with Apache License 2.0
/** */
private static void editDataAndSaveToNewTable(Ignite ignite, SparkSession spark) {
    //Load content of Ignite table to data frame.
    Dataset<Row> personDataFrame = spark.read()
        .format(IgniteDataFrameSettings.FORMAT_IGNITE())
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")
        .load();

    System.out.println("Data frame content:");

    //Printing content of data frame to console.
    personDataFrame.show();

    System.out.println("Modifying Data Frame and write it to Ignite:");

    personDataFrame
        .withColumn("id", col("id").plus(42))     //Edit id column
        .withColumn("name", reverse(col("name"))) //Edit name column
        .write().format(IgniteDataFrameSettings.FORMAT_IGNITE())
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "new_persons")
        .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS(), "id, city_id")
        .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PARAMETERS(), "backups=1")
        .mode(SaveMode.Overwrite) //Overwriting entire table.
        .save();

    System.out.println("Done!");

    System.out.println("Reading data from Ignite table:");

    CacheConfiguration<?, ?> ccfg = new CacheConfiguration<>(CACHE_NAME);
    IgniteCache<?, ?> cache = ignite.getOrCreateCache(ccfg);

    //Reading saved data from Ignite.
    List<List<?>> data = cache.query(new SqlFieldsQuery("SELECT id, name, city_id FROM new_persons")).getAll();

    System.out.println(data);
}
Example #22
Source File: JavaTC.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTC")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    Integer slices = (args.length > 0) ? Integer.parseInt(args[0]) : 2;
    JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
        new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<>(e._2(), e._1());
            }
        });

    long oldCount;
    long nextCount = tc.count();
    do {
        oldCount = nextCount;
        // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        // then project the result to obtain the new (x, z) paths.
        tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
        nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    spark.stop();
}
Example #23
Source File: DrugBankDataset.java From mmtf-spark with Apache License 2.0
/**
 * Reads a CSV file into a Spark dataset.
 *
 * @param inputFileName
 * @throws IOException
 */
private static Dataset<Row> readCsv(String inputFileName) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();

    Dataset<Row> dataset = spark.read().format("csv").option("header", "true").option("inferSchema", "true")
        .load(inputFileName);

    return dataset;
}
Example #24
Source File: DefaultMetricCollector.java From ExecDashboard with Apache License 2.0
public void collect(SparkSession sparkSession, JavaSparkContext javaSparkContext, List<?> objectList) {
    if ((sparkSession == null) || (javaSparkContext == null) || CollectionUtils.isEmpty(objectList)) {
        return;
    }

    if (objectList.get(0) instanceof Portfolio) {
        collectPortFolioMetrics(sparkSession, javaSparkContext, (List<Portfolio>) objectList);
        return;
    }

    if (objectList.get(0) instanceof Lob) {
        collectLobMetrics(sparkSession, javaSparkContext, (List<Lob>) objectList);
        return;
    }
}
Example #25
Source File: TextFileToDataset2.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Dataset from Text File")
        .master("local[*]")
        .getOrCreate();

    String filename = "data/simple-data-file.txt";
    Dataset<Row> df = spark.read().text(filename);
    df.show();
}
Example #26
Source File: DefaultDataCollector.java From ExecDashboard with Apache License 2.0
DefaultDataCollector(String collectionName, String query, List<String> collectorItemIds,
        SparkSession sparkSession, JavaSparkContext javaSparkContext,
        PortfolioCollectorSetting portfolioCollectorSetting) {
    this.collectionName = collectionName;
    this.query = query;
    this.collectorItemIds = collectorItemIds;
    this.sparkSession = sparkSession;
    this.javaSparkContext = javaSparkContext;
    this.portfolioCollectorSetting = portfolioCollectorSetting;
}
Example #27
Source File: JavaLDAExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    // Creates a SparkSession
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaLDAExample")
        .getOrCreate();

    // $example on$
    // Loads data.
    Dataset<Row> dataset = spark.read().format("libsvm")
        .load("data/mllib/sample_lda_libsvm_data.txt");

    // Trains an LDA model.
    LDA lda = new LDA().setK(10).setMaxIter(10);
    LDAModel model = lda.fit(dataset);

    double ll = model.logLikelihood(dataset);
    double lp = model.logPerplexity(dataset);
    System.out.println("The lower bound on the log likelihood of the entire corpus: " + ll);
    System.out.println("The upper bound on perplexity: " + lp);

    // Describe topics.
    Dataset<Row> topics = model.describeTopics(3);
    System.out.println("The topics described by their top-weighted terms:");
    topics.show(false);

    // Shows the result.
    Dataset<Row> transformed = model.transform(dataset);
    transformed.show(false);
    // $example off$

    spark.stop();
}
Example #28
Source File: PdbMetadataDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local[*]")
        .appName(PdbMetadataDemo.class.getSimpleName())
        .getOrCreate();

    // query the following fields from the _citation category using PDBj's Mine2 web service:
    // journal_abbrev, pdbx_database_id_PubMed, year.
    // Note, mixed case column names must be quoted and escaped with \".
    String sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'";
    Dataset<Row> ds = PdbjMineDataset.getDataset(sqlQuery);

    System.out.println("First 10 results from query: " + sqlQuery);
    ds.show(10, false);

    // filter out unpublished entries (they contain the word "published" in various upper/lower case combinations)
    ds = ds.filter("UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'");

    // print the top 10 journals
    System.out.println("Top 10 journals that publish PDB structures:");
    ds.groupBy("journal_abbrev").count().sort(col("count").desc()).show(10, false);

    // filter out entries without a PubMed Id (is -1 if PubMed Id is not available)
    ds = ds.filter("pdbx_database_id_PubMed > 0");
    System.out.println("Entries with PubMed Ids: " + ds.count());

    // show growth of papers in PubMed
    System.out.println("PubMed Ids per year: ");
    ds.groupBy("year").count().sort(col("year").desc()).show(10, false);

    spark.close();
}
Example #29
Source File: Bundles.java From bunsen with Apache License 2.0
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceClass the type of resource to extract.
 * @return a dataset of the given resource
 */
public Dataset<Row> extractEntry(SparkSession spark, JavaRDD<BundleContainer> bundles,
        Class resourceClass) {

    RuntimeResourceDefinition definition = FhirContexts.contextFor(fhirVersion)
        .getResourceDefinition(resourceClass);

    return extractEntry(spark, bundles, definition.getName());
}
Example #30
Source File: ConceptMaps.java From bunsen with Apache License 2.0
/**
 * Returns an empty ConceptMaps instance.
 *
 * @param spark the spark session
 * @return an empty ConceptMaps instance.
 */
public static ConceptMaps getEmpty(SparkSession spark) {
    Dataset<ConceptMap> emptyConceptMaps = spark.emptyDataset(CONCEPT_MAP_ENCODER)
        .withColumn("timestamp", lit(null).cast("timestamp"))
        .as(CONCEPT_MAP_ENCODER);

    return new ConceptMaps(spark,
        spark.emptyDataset(URL_AND_VERSION_ENCODER),
        emptyConceptMaps,
        spark.emptyDataset(MAPPING_ENCODER));
}