Java Code Examples for org.apache.spark.sql.Dataset#printSchema()
The following examples show how to use org.apache.spark.sql.Dataset#printSchema().
Each example is taken from an open-source project; the source file, project, and license are listed above it.
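As a quick orientation before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class and column names are invented for illustration) showing the tree-formatted output that printSchema() writes to standard output:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class PrintSchemaSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("PrintSchemaSketch")
            .master("local[*]")
            .getOrCreate();

        // build a tiny single-column Dataset<Row> from an in-memory list
        List<String> names = Arrays.asList("Alice", "Bob");
        Dataset<Row> df = spark.createDataset(names, Encoders.STRING()).toDF("name");

        // printSchema() prints the column names, types, and nullability
        // as a tree, e.g.:
        // root
        //  |-- name: string (nullable = true)
        df.printSchema();

        spark.stop();
    }
}

The schema tree lists each column with its data type and nullability; nested struct and array columns appear as indented sub-trees.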
Example 1
Source File: SwissModelDataset.java From mmtf-spark with Apache License 2.0 | 6 votes |
/**
 * Flattens the original hierarchical data schema into a simple row-based
 * schema. Some less useful data are excluded.
 *
 * @param ds hierarchical dataset
 * @return flattened dataset
 */
private static Dataset<Row> flattenDataset(Dataset<Row> ds) {
    ds.printSchema();

    return ds.withColumn("structures", explode(ds.col("result.structures")))
             .select(col("query.ac"), col("result.sequence"),
                     col("structures.from"), col("structures.to"),
                     col("structures.qmean"), col("structures.qmean_norm"),
                     col("structures.gmqe"), col("structures.coverage"),
                     col("structures.oligo-state"), col("structures.method"),
                     col("structures.template"), col("structures.identity"),
                     col("structures.similarity"), col("structures.coordinates"),
                     col("result.md5"), col("structures.md5"));
}
Example 2
Source File: DSSPDemo.java From mmtf-spark with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(DSSPDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToPolymerChains());

    Dataset<Row> ds = SecondaryStructureExtractor.getDataset(pdb);

    // show the schema of this dataset
    ds.printSchema();
    ds.show(2, false);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 3
Source File: ReadLinesFromFileStream.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() throws TimeoutException {
    log.debug("-> start()");

    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream")
        .master("local")
        .getOrCreate();

    Dataset<Row> df = spark
        .readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    StreamingQuery query = df
        .writeStream()
        .outputMode(OutputMode.Update())
        .format("console")
        .start();

    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }

    // Never executed
    df.show();
    df.printSchema();
}
Example 4
Source File: ReducerApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();

    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
    df.show();
    df.printSchema();

    Integer sumByReduce = df.reduce(new SumByReduce());
    System.out.println("Sum should be 55 and it is... " + sumByReduce);
}
Example 5
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License | 5 votes |
private static void runJsonDatasetExample(SparkSession spark) {
    // $example on:json_dataset$
    // A JSON dataset is pointed to by path.
    // The path can be either a single text file or a directory storing text files
    Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // The inferred schema can be visualized using the printSchema() method
    people.printSchema();
    // root
    //  |-- age: long (nullable = true)
    //  |-- name: string (nullable = true)

    // Creates a temporary view using the DataFrame
    people.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
    namesDF.show();
    // +------+
    // |  name|
    // +------+
    // |Justin|
    // +------+

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays.asList(
        "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
    Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
    anotherPeople.show();
    // +---------------+----+
    // |        address|name|
    // +---------------+----+
    // |[Columbus,Ohio]| Yin|
    // +---------------+----+
    // $example off:json_dataset$
}
Example 6
Source File: XMLFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");

    SparkSession sparkSession = SparkSession
        .builder()
        .master("local")
        .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
        .appName("JavaALSExample")
        .getOrCreate();

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    HashMap<String, String> params = new HashMap<String, String>();
    params.put("rowTag", "food");
    params.put("failFast", "true");

    Dataset<Row> docDF = sparkSession.read()
        .format("com.databricks.spark.xml")
        .options(params)
        .load("C:/Users/sumit.kumar/git/learning/src/main/resources/breakfast_menu.xml");

    docDF.printSchema();
    docDF.show();

    docDF.write().format("com.databricks.spark.xml")
        .option("rootTag", "food")
        .option("rowTag", "food")
        .save("C:/Users/sumit.kumar/git/learning/src/main/resources/newMenu.xml");
}
Example 7
Source File: BookUrlBuilderApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() { SparkSession spark = SparkSession.builder().appName("Book URL Builder") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING()); ds.printSchema(); ds.show(20, 80); }
Example 8
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License | 5 votes |
private static void runParquetSchemaMergingExample(SparkSession spark) {
    // $example on:schema_merging$
    List<Square> squares = new ArrayList<>();
    for (int value = 1; value <= 5; value++) {
        Square square = new Square();
        square.setValue(value);
        square.setSquare(value * value);
        squares.add(square);
    }

    // Create a simple DataFrame, store into a partition directory
    Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class);
    squaresDF.write().parquet("data/test_table/key=1");

    List<Cube> cubes = new ArrayList<>();
    for (int value = 6; value <= 10; value++) {
        Cube cube = new Cube();
        cube.setValue(value);
        cube.setCube(value * value * value);
        cubes.add(cube);
    }

    // Create another DataFrame in a new partition directory,
    // adding a new column and dropping an existing column
    Dataset<Row> cubesDF = spark.createDataFrame(cubes, Cube.class);
    cubesDF.write().parquet("data/test_table/key=2");

    // Read the partitioned table
    Dataset<Row> mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table");
    mergedDF.printSchema();

    // The final schema consists of all 3 columns in the Parquet files together
    // with the partitioning column that appears in the partition directory paths
    // root
    //  |-- value: int (nullable = true)
    //  |-- square: int (nullable = true)
    //  |-- cube: int (nullable = true)
    //  |-- key: int (nullable = true)
    // $example off:schema_merging$
}
Example 9
Source File: CustomReportDemo.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // retrieve PDB annotation: binding affinities (Ki, Kd),
    // group name of the ligand (hetId), and the
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki", "Kd", "hetId", "ecNo");

    // show the schema of this dataset
    ds.printSchema();

    // select structures that have a Ki or Kd value and
    // are protein-serine/threonine kinases (EC 2.7.11.*):

    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    // B. by creating a temporary view and running SQL
    ds.createOrReplaceTempView("table");
    Dataset<Row> dsSql = ds.sparkSession().sql(
        "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    dsSql.show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 10
Source File: SecondaryStructureWord2VecModelEncoder.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName()
            + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add Word2Vec encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
Example 11
Source File: SecondaryStructureBlosum62Encoder.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName()
            + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.blosum62Encode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
Example 12
Source File: ProcessVendorTrasactions.java From aws-big-data-blog with Apache License 2.0 | 4 votes |
public static void run(String jobInputParam) throws Exception {
    List<StructField> schemaFields = new ArrayList<StructField>();
    schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true));
    schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(schemaFields);

    SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys");
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    String redshiftJDBCURL = props.getProperty("redshift.jdbc.url");
    String s3TempPath = props.getProperty("s3.temp.path");
    System.out.println("props" + props);

    JavaRDD<Row> salesRDD = sc.textFile(jobInputParam)
        .map(new Function<String, Row>() {
            public Row call(String saleRec) {
                String[] fields = saleRec.split(",");
                return RowFactory.create(fields[0], fields[1], fields[2], fields[3], fields[4]);
            }
        });
    Dataset<Row> salesDF = spark.createDataFrame(salesRDD, schema);

    Dataset<Row> vendorItemSaleAmountDF = salesDF
        .filter(salesDF.col("trans_type").equalTo("4"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));
    Dataset<Row> vendorItemTaxAmountDF = salesDF
        .filter(salesDF.col("trans_type").equalTo("5"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));
    Dataset<Row> vendorItemDiscountAmountDF = salesDF
        .filter(salesDF.col("trans_type").equalTo("6"))
        .groupBy(salesDF.col("vendor_id"), salesDF.col("item_id"), salesDF.col("trans_date"))
        .agg(ImmutableMap.of("trans_amount", "sum"));

    String[] joinColArray = {"vendor_id", "item_id", "trans_date"};
    vendorItemSaleAmountDF.printSchema();
    Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq();

    Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF
        .join(vendorItemTaxAmountDF, commonJoinColumns, "left_outer")
        .join(vendorItemDiscountAmountDF, commonJoinColumns, "left_outer")
        .toDF("vendor_id", "item_id", "trans_date", "sale_amount", "tax_amount", "discount_amount");

    vendorAggregatedDF.printSchema();

    DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain();
    AWSSessionCredentials creds = (AWSSessionCredentials) provider.getCredentials();

    String appendix = new StringBuilder(String.valueOf(System.currentTimeMillis()))
        .append("_")
        .append(String.valueOf(new Random().nextInt(10) + 1))
        .toString();

    String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp")
        .append(appendix)
        .append(" where vendortranssummary.vendor_id=vendortranssummary_temp")
        .append(appendix)
        .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp")
        .append(appendix)
        .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp")
        .append(appendix)
        .append(".trans_date;")
        .append("insert into vendortranssummary select * from vendortranssummary_temp")
        .append(appendix)
        .append(";drop table vendortranssummary_temp")
        .append(appendix)
        .append(";end transaction;").toString();

    vendorAggregatedDF.write().format("com.databricks.spark.redshift")
        .option("url", redshiftJDBCURL)
        .option("dbtable", "vendortranssummary_temp" + appendix)
        .option("usestagingtable", "false")
        .option("postactions", vendorTransSummarySQL)
        .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId())
        .option("temporary_aws_secret_access_key", creds.getAWSSecretKey())
        .option("temporary_aws_session_token", creds.getSessionToken())
        .option("tempdir", s3TempPath)
        .mode(SaveMode.Overwrite)
        .save();
}
Example 13
Source File: ProteinFoldDatasetCreator.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get secondary structure content
    Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

    // classify chains by secondary structure type
    double minThreshold = 0.05;
    double maxThreshold = 0.15;
    data = addProteinFoldType(data, minThreshold, maxThreshold);

    // create a binary classification dataset
    data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

    // create a three-state classification model (alpha, beta, alpha+beta)
    // data = data.filter("foldType != 'other'").cache();

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = 11;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25);

    // keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();

    System.out.println((end-start)/1E9 + " sec");
}
Example 14
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");
    Dataset<Row> topGroups = interactions
        .groupBy("residue2")
        .count();
    topGroups
        .sort(col("count").desc()) // sort descending by count
        .show(10);

    System.out.println("Top interacting group/atoms types");
    Dataset<Row> topGroupsAndAtoms = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("residue2", "atom2")
        .count();
    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
        .filter("frequency > 0.01") // filter out occurrences < 1%
        .sort(col("frequency").desc()) // sort descending
        .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("element2")
        .count();
    topElements.withColumn("frequency", col("count").divide(n))
        .filter("frequency > 0.01") // filter out occurrences < 1%
        .sort(col("frequency").desc()) // sort descending
        .show(10);

    interactions
        .groupBy("element2")
        .avg("distance")
        .sort("avg(distance)")
        .show(10);

    // Aggregate multiple statistics.
    // Note: import static org.apache.spark.sql.functions.* is required,
    // e.g. org.apache.spark.sql.functions.avg,
    // for the full list of available functions.
    interactions
        .groupBy("element2")
        .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
        .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 15
Source File: SecondaryStructurePropertyEncoder.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args outputFilePath outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName()
            + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<= 20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.propertyEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
Example 16
Source File: JsonFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    SparkSession sparkSession = SparkSession
        .builder()
        .master("local")
        .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
        .appName("JavaALSExample")
        .getOrCreate();

    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);

    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD()
        .map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));

    mapParser.foreach(t -> System.out.println(t));

    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();

    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();

    StructType schema = new StructType(new StructField[] {
        DataTypes.createStructField("cid", DataTypes.IntegerType, true),
        DataTypes.createStructField("county", DataTypes.StringType, true),
        DataTypes.createStructField("firstName", DataTypes.StringType, true),
        DataTypes.createStructField("sex", DataTypes.StringType, true),
        DataTypes.createStructField("year", DataTypes.StringType, true),
        DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true)
    });

    /* StructType pep = new StructType(new StructField[] {
        new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
        new StructField("County", DataTypes.StringType, true, Metadata.empty()),
        new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
        new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
        new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
        new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty())
    }); */

    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
    person_mod.printSchema();
    person_mod.show();

    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 17
Source File: AuthorsAndBooksCountBooksApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
private void start() { SparkSession spark = SparkSession.builder() .appName("Authors and Books") .master("local").getOrCreate(); String filename = "data/authors.csv"; Dataset<Row> authorsDf = spark.read() .format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); authorsDf.show(); authorsDf.printSchema(); filename = "data/books.csv"; Dataset<Row> booksDf = spark.read() .format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); booksDf.show(); booksDf.printSchema(); Dataset<Row> libraryDf = authorsDf .join( booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "left") .withColumn("bookId", booksDf.col("id")) .drop(booksDf.col("id")) .groupBy( authorsDf.col("id"), authorsDf.col("name"), authorsDf.col("link")) .count(); libraryDf = libraryDf.orderBy(libraryDf.col("count").desc()); libraryDf.show(); libraryDf.printSchema(); }
Example 18
Source File: JReadPartitionAware.java From spark-data-sources with MIT License | 4 votes |
public static void main(String[] args)
        throws IOException, InterruptedException, ExistingTableException, UnknownTableException {

    final String serverHost = "localhost";
    final int serverPort = 50199;

    DBServer server = new DBServer(serverPort);
    server.start();
    System.out.println("*** Example database server started");

    //
    // Since this DataSource doesn't support writing, we need to populate
    // ExampleDB with some data.
    //
    Schema schema = new Schema();
    schema.addColumn("g", Schema.ColumnType.STRING);
    schema.addColumn("u", Schema.ColumnType.INT64);

    DBClient client = new DBClient(serverHost, serverPort);
    //
    // Specify that the table is partitioned on column G
    //
    client.createTable("myTable", schema, "g");

    List<edb.common.Row> toInsert = new ArrayList<>();
    for (int i = 0; i < 20; i++) {
        edb.common.Row r = new edb.common.Row();
        //
        // String column with four distinct values for clustering
        //
        r.addField(new edb.common.Row.StringField("g", "G_" + (i % 4)));
        r.addField(new edb.common.Row.Int64Field("u", i * 100));
        toInsert.add(r);
    }
    client.bulkInsert("myTable", toInsert);
    System.out.println("*** Example database server populated with data");

    //
    // By default this data source supports creating Datasets with four partitions.
    //
    String dataSourceName = "datasources.PartitioningRowDataSource";

    SparkSession spark = SparkSession
        .builder()
        .appName("JReadPartitionAware")
        .master("local[4]")
        .getOrCreate();

    //
    // This is where we read from our DataSource. Notice how we use the
    // fully qualified class name and provide the information needed to connect to
    // ExampleDB using options. We specify two partitions so that each can be expected
    // to contain two clusters.
    //
    Dataset<Row> data = spark.read()
        .format(dataSourceName)
        .option("host", serverHost)
        .option("port", serverPort)
        .option("table", "myTable")
        .option("partitions", 2) // number of partitions specified here
        .load();

    System.out.println("*** Schema: ");
    data.printSchema();

    System.out.println("*** Data: ");
    data.show();

    RDDUtils.analyze(data);

    Dataset<Row> aggregated = data.groupBy(col("g")).agg(sum(col("u")));

    System.out.println("*** Query result: ");
    aggregated.show();

    RDDUtils.analyze(aggregated);

    spark.stop();
    server.stop();
}
Example 19
Source File: UnionApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
/**
 * The processing code.
 *
 * @throws ParseException
 */
private void start() throws ParseException {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("expr()")
        .master("local")
        .getOrCreate();

    // DateTimeFormatter formatter = DateTimeFormatter.ofPattern(
    //     "yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);

    // Data
    StructType dataSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("NAME", DataTypes.StringType, false),
        DataTypes.createStructField("START_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("END_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("STATUS", DataTypes.StringType, false) });
    List<Row> dataRows = new ArrayList<Row>();
    dataRows.add(RowFactory.create("Alex", toDate("2018-01-01 00:00:00"),
        toDate("2018-02-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Bob", toDate("2018-02-01 00:00:00"),
        toDate("2018-02-05 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-02-01 00:00:00"),
        toDate("2018-03-01 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-05-01 00:00:00"),
        toDate("2018-08-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Meggy", toDate("2018-02-01 00:00:00"),
        toDate("2018-02-01 00:00:00"), "OUT"));
    Dataset<Row> dataDf = spark.createDataFrame(dataRows, dataSchema);
    dataDf.show();
    dataDf.printSchema();

    // Header
    StructType headerSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("_c1", DataTypes.StringType, false),
        DataTypes.createStructField("_c2", DataTypes.StringType, false),
        DataTypes.createStructField("_c3", DataTypes.StringType, false),
        DataTypes.createStructField("_c4", DataTypes.StringType, false) });
    List<Row> headerRows = new ArrayList<Row>();
    headerRows.add(RowFactory.create("REQUEST_DATE",
        format.format(new java.util.Date()), "", ""));
    headerRows.add(RowFactory.create("USER", "Kate", "", ""));
    headerRows.add(RowFactory.create("SEARCH_TYPE", "Global", "", ""));
    headerRows.add(RowFactory.create("", "", "", ""));
    headerRows.add(RowFactory.create("NAME", "START_DATE", "END_DATE", "STATUS"));
    Dataset<Row> headerDf = spark.createDataFrame(headerRows, headerSchema);
    headerDf.show(false);
    headerDf.printSchema();

    // Transition
    Dataset<Row> transitionDf = dataDf
        .withColumn("_c1", dataDf.col("NAME"))
        .withColumn("_c2", dataDf.col("START_DATE").cast(DataTypes.StringType))
        .withColumn("_c3", dataDf.col("END_DATE").cast(DataTypes.StringType))
        .withColumn("_c4", dataDf.col("STATUS").cast(DataTypes.StringType))
        .drop("NAME")
        .drop("START_DATE")
        .drop("END_DATE")
        .drop("STATUS");
    transitionDf.show(false);
    transitionDf.printSchema();

    // Union
    Dataset<Row> unionDf = headerDf.unionByName(transitionDf);
    unionDf.show(false);
    unionDf.printSchema();
}
Example 20
Source File: JReadPartitionAware_Mismatch.java From spark-data-sources with MIT License | 4 votes |
public static void main(String[] args)
        throws IOException, InterruptedException, ExistingTableException, UnknownTableException {

    final String serverHost = "localhost";
    final int serverPort = 50199;

    DBServer server = new DBServer(serverPort);
    server.start();
    System.out.println("*** Example database server started");

    //
    // Since this DataSource doesn't support writing, we need to populate
    // ExampleDB with some data.
    //
    Schema schema = new Schema();
    schema.addColumn("g", Schema.ColumnType.STRING);
    schema.addColumn("u", Schema.ColumnType.INT64);

    DBClient client = new DBClient(serverHost, serverPort);
    //
    // This time the table is not clustered on any column
    //
    client.createTable("myTable", schema);

    List<edb.common.Row> toInsert = new ArrayList<>();
    for (int i = 0; i < 20; i++) {
        edb.common.Row r = new edb.common.Row();
        //
        // String column with four distinct values for clustering
        //
        r.addField(new edb.common.Row.StringField("g", "G_" + (i % 4)));
        r.addField(new edb.common.Row.Int64Field("u", i * 100));
        toInsert.add(r);
    }
    client.bulkInsert("myTable", toInsert);
    System.out.println("*** Example database server populated with data");

    //
    // By default this data source supports creating Datasets with four partitions.
    //
    String dataSourceName = "datasources.PartitioningRowDataSource";

    SparkSession spark = SparkSession
        .builder()
        .appName("JReadPartitionAware-Mismatch")
        .master("local[4]")
        .getOrCreate();

    //
    // This is where we read from our DataSource. Notice how we use the
    // fully qualified class name and provide the information needed to connect to
    // ExampleDB using options. We specify two partitions so that each can be expected
    // to contain two clusters. But the table wasn't set up with the column clustered, so
    // a shuffle will be needed.
    //
    Dataset<Row> data = spark.read()
        .format(dataSourceName)
        .option("host", serverHost)
        .option("port", serverPort)
        .option("table", "myTable")
        .option("partitions", 2) // number of partitions specified here
        .load();

    System.out.println("*** Schema: ");
    data.printSchema();

    System.out.println("*** Data: ");
    data.show();

    RDDUtils.analyze(data);

    Dataset<Row> aggregated = data.groupBy(col("g")).agg(sum(col("u")));

    //
    // Note: since a shuffle was required, the resulting table has the usual default
    // number of partitions -- 200 as of Spark 2.3.0
    //
    System.out.println("*** Query result: ");
    aggregated.show();

    RDDUtils.analyze(aggregated);

    spark.stop();
    server.stop();
}