Java Code Examples for org.apache.spark.sql.Dataset#printSchema()
The following examples show how to use org.apache.spark.sql.Dataset#printSchema().
Each example is taken from an open-source project; the source file, project, and license are listed above it.
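As a quick orientation before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class and column names are invented for illustration) showing the tree-formatted output that printSchema() writes to standard output:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PrintSchemaSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("PrintSchemaSketch")
            .master("local[*]")
            .getOrCreate();

        // build a tiny single-column Dataset<Row> from an in-memory list
        List<String> names = Arrays.asList("Alice", "Bob");
        Dataset<Row> df = spark.createDataset(names, Encoders.STRING()).toDF("name");

        // printSchema() prints the column names, types, and nullability
        // as a tree, e.g.:
        // root
        //  |-- name: string (nullable = true)
        df.printSchema();

        spark.stop();
    }
}

The schema tree lists each column with its data type and nullability; nested struct and array columns appear as indented sub-trees.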
Example 1
Source File: SwissModelDataset.java From mmtf-spark with Apache License 2.0 | 6 votes |
/**
 * Flattens the original hierarchical data schema into a simple row-based
 * schema. Some less useful data are excluded.
 *
 * @param ds hierarchical dataset
 * @return flattened dataset
 */
private static Dataset<Row> flattenDataset(Dataset<Row> ds) {
    ds.printSchema();

    return ds.withColumn("structures", explode(ds.col("result.structures")))
             .select(col("query.ac"), col("result.sequence"),
                     col("structures.from"), col("structures.to"),
                     col("structures.qmean"), col("structures.qmean_norm"),
                     col("structures.gmqe"), col("structures.coverage"),
                     col("structures.oligo-state"), col("structures.method"),
                     col("structures.template"), col("structures.identity"),
                     col("structures.similarity"), col("structures.coordinates"),
                     col("result.md5"), col("structures.md5"));
}
Example 2
Source File: DSSPDemo.java From mmtf-spark with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(DSSPDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());

    Dataset<Row> ds = SecondaryStructureExtractor.getDataset(pdb);

    // show the schema of this dataset
    ds.printSchema();
    ds.show(2, false);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 3
Source File: ReadLinesFromFileStream.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() throws TimeoutException {
    log.debug("-> start()");

    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream")
        .master("local")
        .getOrCreate();

    Dataset<Row> df = spark
        .readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    StreamingQuery query = df
        .writeStream()
        .outputMode(OutputMode.Update())
        .format("console")
        .start();

    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }

    // Never executed
    df.show();
    df.printSchema();
}
Example 4
Source File: ReducerApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();

    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
    df.show();
    df.printSchema();

    Integer sumByReduce = df.reduce(new SumByReduce());
    System.out.println("Sum should be 55 and it is... " + sumByReduce);
}
Example 5
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License | 5 votes |
private static void runJsonDatasetExample(SparkSession spark) {
    // $example on:json_dataset$
    // A JSON dataset is pointed to by path.
    // The path can be either a single text file or a directory storing text files
    Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // The inferred schema can be visualized using the printSchema() method
    people.printSchema();
    // root
    //  |-- age: long (nullable = true)
    //  |-- name: string (nullable = true)

    // Creates a temporary view using the DataFrame
    people.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
    namesDF.show();
    // +------+
    // |  name|
    // +------+
    // |Justin|
    // +------+

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays.asList(
        "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
    Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
    anotherPeople.show();
    // +---------------+----+
    // |        address|name|
    // +---------------+----+
    // |[Columbus,Ohio]| Yin|
    // +---------------+----+
    // $example off:json_dataset$
}
Example 6
Source File: XMLFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");

    SparkSession sparkSession = SparkSession
        .builder()
        .master("local")
        .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
        .appName("JavaALSExample")
        .getOrCreate();

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    HashMap<String, String> params = new HashMap<String, String>();
    params.put("rowTag", "food");
    params.put("failFast", "true");

    Dataset<Row> docDF = sparkSession.read()
        .format("com.databricks.spark.xml")
        .options(params)
        .load("C:/Users/sumit.kumar/git/learning/src/main/resources/breakfast_menu.xml");

    docDF.printSchema();
    docDF.show();

    docDF.write().format("com.databricks.spark.xml")
        .option("rootTag", "food")
        .option("rowTag", "food")
        .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMenu.xml");
}
Example 7
Source File: BookUrlBuilderApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() { SparkSession spark = SparkSession.builder().appName("Book URL Builder") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING()); ds.printSchema(); ds.show(20, 80); }
Example 8
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License | 5 votes |
private static void runParquetSchemaMergingExample(SparkSession spark) {
    // $example on:schema_merging$
    List<Square> squares = new ArrayList<>();
    for (int value = 1; value <= 5; value++) {
        Square square = new Square();
        square.setValue(value);
        square.setSquare(value * value);
        squares.add(square);
    }

    // Create a simple DataFrame, store into a partition directory
    Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class);
    squaresDF.write().parquet("data/test_table/key=1");

    List<Cube> cubes = new ArrayList<>();
    for (int value = 6; value <= 10; value++) {
        Cube cube = new Cube();
        cube.setValue(value);
        cube.setCube(value * value * value);
        cubes.add(cube);
    }

    // Create another DataFrame in a new partition directory,
    // adding a new column and dropping an existing column
    Dataset<Row> cubesDF = spark.createDataFrame(cubes, Cube.class);
    cubesDF.write().parquet("data/test_table/key=2");

    // Read the partitioned table
    Dataset<Row> mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table");
    mergedDF.printSchema();

    // The final schema consists of all 3 columns in the Parquet files together
    // with the partitioning column that appears in the partition directory paths
    // root
    //  |-- value: int (nullable = true)
    //  |-- square: int (nullable = true)
    //  |-- cube: int (nullable = true)
    //  |-- key: int (nullable = true)
    // $example off:schema_merging$
}
Example 9
Source File: CustomReportDemo.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // retrieve PDB annotation: binding affinities (Ki, Kd),
    // group name of the ligand (hetId), and the
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki", "Kd", "hetId", "ecNo");

    // show the schema of this dataset
    ds.printSchema();

    // select structures that have a Ki or Kd value and
    // are protein-serine/threonine kinases (EC 2.7.11.*):

    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    // B. by creating a temporary view and running SQL
    ds.createOrReplaceTempView("table");
    Dataset<Row> dsSql = ds.sparkSession().sql(
        "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    dsSql.show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 10
Source File: SecondaryStructureWord2VecModelEncoder.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName()
            + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add Word2Vec encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
Example 11
Source File: SecondaryStructureBlosum62Encoder.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName()
            + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.blosum62Encode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
Example 12
Source File: ProcessVendorTrasactions.java From aws-big-data-blog with Apache License 2.0 | 4 votes |
public static void run(String jobInputParam) throws Exception {
    List<StructField> schemaFields = new ArrayList<StructField>();
    schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(schemaFields);

    SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys");
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    String redshiftJDBCURL = props.getProperty("redshift.jdbc.url");
    String s3TempPath = props.getProperty("s3.temp.path");
    System.out.println("props" + props);

    JavaRDD<Row> salesRDD = sc.textFile(jobInputParam)
        .map(new Function<String, Row>() {
            public Row call(String saleRec) {
                String[] fields = saleRec.split(",");
                return RowFactory.create(fields[0], fields[1], fields[2], fields[3], fields[4]);
            }
        });
    Dataset<Row> salesDF = spark.createDataFrame(salesRDD, schema);

    Dataset<Row> vendorItemSaleAmountDF = salesDF
        .filter(salesDF.col("trans_type").equalTo("4"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));
    Dataset<Row> vendorItemTaxAmountDF = salesDF
        .filter(salesDF.col("trans_type").equalTo("5"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));
    Dataset<Row> vendorItemDiscountAmountDF = salesDF
        .filter(salesDF.col("trans_type").equalTo("6"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));

    String[] joinColArray = {"vendor_id", "item_id", "trans_date"};
    vendorItemSaleAmountDF.printSchema();
    Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq();

    Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF
        .join(vendorItemTaxAmountDF, commonJoinColumns, "left_outer")
        .join(vendorItemDiscountAmountDF, commonJoinColumns, "left_outer")
        .toDF("vendor_id", "item_id", "trans_date", "sale_amount", "tax_amount", "discount_amount");

    vendorAggregatedDF.printSchema();

    DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain();
    AWSSessionCredentials creds = (AWSSessionCredentials) provider.getCredentials();

    String appendix = new StringBuilder(String.valueOf(System.currentTimeMillis()))
        .append("_")
        .append(String.valueOf(new Random().nextInt(10) + 1))
        .toString();

    String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp")
        .append(appendix)
        .append(" where vendortranssummary.vendor_id=vendortranssummary_temp")
        .append(appendix)
        .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp")
        .append(appendix)
        .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp")
        .append(appendix)
        .append(".trans_date;")
        .append("insert into vendortranssummary select * from vendortranssummary_temp")
        .append(appendix)
        .append(";drop table vendortranssummary_temp")
        .append(appendix)
        .append(";end transaction;").toString();

    vendorAggregatedDF.write().format("com.databricks.spark.redshift")
        .option("url", redshiftJDBCURL)
        .option("dbtable", "vendortranssummary_temp" + appendix)
        .option("usestagingtable", "false")
        .option("postactions", vendorTransSummarySQL)
        .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId())
        .option("temporary_aws_secret_access_key", creds.getAWSSecretKey())
        .option("temporary_aws_session_token", creds.getSessionToken())
        .option("tempdir", s3TempPath)
        .mode(SaveMode.Overwrite)
        .save();
}
Example 13
Source File: ProteinFoldDatasetCreator.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get secondary structure content
    Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

    // classify chains by secondary structure type
    double minThreshold = 0.05;
    double maxThreshold = 0.15;
    data = addProteinFoldType(data, minThreshold, maxThreshold);

    // create a binary classification dataset
    data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

    // create a three-state classification model (alpha, beta, alpha+beta)
    // data = data.filter("foldType != 'other'").cache();

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = 11;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25);

    // keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();

    System.out.println((end-start)/1E9 + " sec");
}
Example 14
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions
        .groupBy("residue2")
        .count();
    topGroups
        .sort(col("count").desc()) // sort descending by count
        .show(10);

    System.out.println("Top interacting group/atoms types");
    Dataset<Row> topGroupsAndAtoms = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("residue2", "atom2")
        .count();
    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
        .filter("frequency > 0.01") // filter out occurrences < 1%
        .sort(col("frequency").desc()) // sort descending
        .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("element2")
        .count();
    topElements.withColumn("frequency", col("count").divide(n))
        .filter("frequency > 0.01") // filter out occurrences < 1%
        .sort(col("frequency").desc()) // sort descending
        .show(10);

    interactions
        .groupBy("element2")
        .avg("distance")
        .sort("avg(distance)")
        .show(10);

    // Aggregate multiple statistics.
    // Note: import static org.apache.spark.sql.functions.* is required,
    // e.g. org.apache.spark.sql.functions.avg,
    // for the full list of available functions.
    interactions
        .groupBy("element2")
        .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
        .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 15
Source File: SecondaryStructurePropertyEncoder.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args outputFilePath outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName()
            + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.propertyEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
Example 16
Source File: JsonFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    SparkSession sparkSession = SparkSession
        .builder()
        .master("local")
        .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
        .appName("JavaALSExample")
        .getOrCreate();

    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);

    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD()
        .map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));

    mapParser.foreach(t -> System.out.println(t));

    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();

    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();

    StructType schema = new StructType(new StructField[] {
        DataTypes.createStructField("cid", DataTypes.IntegerType, true),
        DataTypes.createStructField("county", DataTypes.StringType, true),
        DataTypes.createStructField("firstName", DataTypes.StringType, true),
        DataTypes.createStructField("sex", DataTypes.StringType, true),
        DataTypes.createStructField("year", DataTypes.StringType, true),
        DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true)
    });

    /* StructType pep = new StructType(new StructField[] {
        new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
        new StructField("County", DataTypes.StringType, true, Metadata.empty()),
        new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
        new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
        new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
        new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty())
    }); */

    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
    person_mod.printSchema();
    person_mod.show();

    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 17
Source File: AuthorsAndBooksCountBooksApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
private void start() { SparkSession spark = SparkSession.builder() .appName("Authors and Books") .master("local").getOrCreate(); String filename = "data/authors.csv"; Dataset<Row> authorsDf = spark.read() .format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); authorsDf.show(); authorsDf.printSchema(); filename = "data/books.csv"; Dataset<Row> booksDf = spark.read() .format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); booksDf.show(); booksDf.printSchema(); Dataset<Row> libraryDf = authorsDf .join( booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "left") .withColumn("bookId", booksDf.col("id")) .drop(booksDf.col("id")) .groupBy( authorsDf.col("id"), authorsDf.col("name"), authorsDf.col("link")) .count(); libraryDf = libraryDf.orderBy(libraryDf.col("count").desc()); libraryDf.show(); libraryDf.printSchema(); }
Example 18
Source File: JReadPartitionAware.java From spark-data-sources with MIT License | 4 votes |
public static void main(String[] args)
        throws IOException, InterruptedException, ExistingTableException, UnknownTableException {

    final String serverHost = "localhost";
    final int serverPort = 50199;

    DBServer server = new DBServer(serverPort);
    server.start();
    System.out.println("*** Example database server started");

    //
    // Since this DataSource doesn't support writing, we need to populate
    // ExampleDB with some data.
    //
    Schema schema = new Schema();
    schema.addColumn("g", Schema.ColumnType.STRING);
    schema.addColumn("u", Schema.ColumnType.INT64);

    DBClient client = new DBClient(serverHost, serverPort);
    //
    // Specify that the table is partitioned on column G
    //
    client.createTable("myTable", schema, "g");

    List<edb.common.Row> toInsert = new ArrayList<>();
    for (int i = 0; i < 20; i++) {
        edb.common.Row r = new edb.common.Row();
        //
        // String column with four distinct values for clustering
        //
        r.addField(new edb.common.Row.StringField("g", "G_" + (i % 4)));
        r.addField(new edb.common.Row.Int64Field("u", i * 100));
        toInsert.add(r);
    }
    client.bulkInsert("myTable", toInsert);
    System.out.println("*** Example database server populated with data");

    //
    // By default this data source supports creating Datasets with four partitions.
    //
    String dataSourceName = "datasources.PartitioningRowDataSource";

    SparkSession spark = SparkSession
        .builder()
        .appName("JReadPartitionAware")
        .master("local[4]")
        .getOrCreate();

    //
    // This is where we read from our DataSource. Notice how we use the
    // fully qualified class name and provide the information needed to connect to
    // ExampleDB using options. We specify two partitions so that each can be expected
    // to contain two clusters.
    //
    Dataset<Row> data = spark.read()
        .format(dataSourceName)
        .option("host", serverHost)
        .option("port", serverPort)
        .option("table", "myTable")
        .option("partitions", 2) // number of partitions specified here
        .load();

    System.out.println("*** Schema: ");
    data.printSchema();

    System.out.println("*** Data: ");
    data.show();

    RDDUtils.analyze(data);

    Dataset<Row> aggregated = data.groupBy(col("g")).agg(sum(col("u")));

    System.out.println("*** Query result: ");
    aggregated.show();

    RDDUtils.analyze(aggregated);

    spark.stop();
    server.stop();
}
Example 19
Source File: UnionApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
/**
 * The processing code.
 *
 * @throws ParseException
 */
private void start() throws ParseException {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("expr()")
        .master("local")
        .getOrCreate();

    // DateTimeFormatter formatter = DateTimeFormatter.ofPattern(
    //     "yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);

    // Data
    StructType dataSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("NAME", DataTypes.StringType, false),
        DataTypes.createStructField("START_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("END_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("STATUS", DataTypes.StringType, false) });
    List<Row> dataRows = new ArrayList<Row>();
    dataRows.add(RowFactory.create("Alex", toDate("2018-01-01 00:00:00"),
        toDate("2018-02-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Bob", toDate("2018-02-01 00:00:00"),
        toDate("2018-02-05 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-02-01 00:00:00"),
        toDate("2018-03-01 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-05-01 00:00:00"),
        toDate("2018-08-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Meggy", toDate("2018-02-01 00:00:00"),
        toDate("2018-02-01 00:00:00"), "OUT"));
    Dataset<Row> dataDf = spark.createDataFrame(dataRows, dataSchema);
    dataDf.show();
    dataDf.printSchema();

    // Header
    StructType headerSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("_c1", DataTypes.StringType, false),
        DataTypes.createStructField("_c2", DataTypes.StringType, false),
        DataTypes.createStructField("_c3", DataTypes.StringType, false),
        DataTypes.createStructField("_c4", DataTypes.StringType, false) });
    List<Row> headerRows = new ArrayList<Row>();
    headerRows.add(RowFactory.create("REQUEST_DATE",
        format.format(new java.util.Date()), "", ""));
    headerRows.add(RowFactory.create("USER", "Kate", "", ""));
    headerRows.add(RowFactory.create("SEARCH_TYPE", "Global", "", ""));
    headerRows.add(RowFactory.create("", "", "", ""));
    headerRows.add(RowFactory.create("NAME", "START_DATE", "END_DATE", "STATUS"));
    Dataset<Row> headerDf = spark.createDataFrame(headerRows, headerSchema);
    headerDf.show(false);
    headerDf.printSchema();

    // Transition
    Dataset<Row> transitionDf = dataDf
        .withColumn("_c1", dataDf.col("NAME"))
        .withColumn("_c2", dataDf.col("START_DATE").cast(DataTypes.StringType))
        .withColumn("_c3", dataDf.col("END_DATE").cast(DataTypes.StringType))
        .withColumn("_c4", dataDf.col("STATUS").cast(DataTypes.StringType))
        .drop("NAME")
        .drop("START_DATE")
        .drop("END_DATE")
        .drop("STATUS");
    transitionDf.show(false);
    transitionDf.printSchema();

    // Union
    Dataset<Row> unionDf = headerDf.unionByName(transitionDf);
    unionDf.show(false);
    unionDf.printSchema();
}
Example 20
Source File: JReadPartitionAware_Mismatch.java From spark-data-sources with MIT License | 4 votes |
public static void main(String[] args)
        throws IOException, InterruptedException, ExistingTableException, UnknownTableException {

    final String serverHost = "localhost";
    final int serverPort = 50199;

    DBServer server = new DBServer(serverPort);
    server.start();
    System.out.println("*** Example database server started");

    //
    // Since this DataSource doesn't support writing, we need to populate
    // ExampleDB with some data.
    //
    Schema schema = new Schema();
    schema.addColumn("g", Schema.ColumnType.STRING);
    schema.addColumn("u", Schema.ColumnType.INT64);

    DBClient client = new DBClient(serverHost, serverPort);
    //
    // This time the table is not clustered on any column
    //
    client.createTable("myTable", schema);

    List<edb.common.Row> toInsert = new ArrayList<>();
    for (int i = 0; i < 20; i++) {
        edb.common.Row r = new edb.common.Row();
        //
        // String column with four distinct values for clustering
        //
        r.addField(new edb.common.Row.StringField("g", "G_" + (i % 4)));
        r.addField(new edb.common.Row.Int64Field("u", i * 100));
        toInsert.add(r);
    }
    client.bulkInsert("myTable", toInsert);
    System.out.println("*** Example database server populated with data");

    //
    // By default this data source supports creating Datasets with four partitions.
    //
    String dataSourceName = "datasources.PartitioningRowDataSource";

    SparkSession spark = SparkSession
        .builder()
        .appName("JReadPartitionAware-Mismatch")
        .master("local[4]")
        .getOrCreate();

    //
    // This is where we read from our DataSource. Notice how we use the
    // fully qualified class name and provide the information needed to connect to
    // ExampleDB using options. We specify two partitions so that each can be expected
    // to contain two clusters. But the table wasn't set up with the column clustered, so
    // a shuffle will be needed.
    //
    Dataset<Row> data = spark.read()
        .format(dataSourceName)
        .option("host", serverHost)
        .option("port", serverPort)
        .option("table", "myTable")
        .option("partitions", 2) // number of partitions specified here
        .load();

    System.out.println("*** Schema: ");
    data.printSchema();

    System.out.println("*** Data: ");
    data.show();

    RDDUtils.analyze(data);

    Dataset<Row> aggregated = data.groupBy(col("g")).agg(sum(col("u")));

    //
    // Note: since a shuffle was required, the resulting table has the usual default
    // number of partitions -- 200 as of Spark 2.3.0
    //
    System.out.println("*** Query result: ");
    aggregated.show();

    RDDUtils.analyze(aggregated);

    spark.stop();
    server.stop();
}