Java Code Examples for org.apache.spark.sql.Dataset#coalesce()
The following examples show how to use org.apache.spark.sql.Dataset#coalesce().
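Before the project examples, here is a minimal, self-contained sketch of the call itself. coalesce(n) returns a new Dataset with at most n partitions by merging existing partitions without a full shuffle, which is why the examples below use coalesce(1) to produce a single output file. The class name and input/output paths in this sketch are illustrative, not taken from any of the projects below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CoalesceSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("CoalesceSketch")
                .getOrCreate();

        // hypothetical input path; any multi-partition dataset works
        Dataset<Row> data = spark.read().json("data/input");
        System.out.println("partitions before: " + data.rdd().getNumPartitions());

        // coalesce(1) narrows to a single partition without a shuffle,
        // so the subsequent write produces a single output file
        Dataset<Row> single = data.coalesce(1);
        System.out.println("partitions after: " + single.rdd().getNumPartitions());

        single.write().mode("overwrite").json("data/output");
        spark.stop();
    }
}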
Example 1
Source File: BatchStep.java From envelope with Apache License 2.0
private Dataset<Row> repartition(Dataset<Row> data) {
  int numPartitions = 0;
  List<String> colPartitions = null;

  if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
  }

  if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
    colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
  }

  // repartition by count, by columns, or by both, depending on configuration
  if (numPartitions > 0 && null != colPartitions) {
    data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
  }
  else if (numPartitions > 0) {
    data = data.repartition(numPartitions);
  }
  else if (null != colPartitions) {
    data = data.repartition(RowUtils.toColumnArray(colPartitions));
  }

  // optionally coalesce to a smaller partition count afterwards
  if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
    data = data.coalesce(numPartitions);
  }

  return data;
}
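A note on the design choice above: repartition(...) performs a full shuffle and can either increase or decrease the partition count (optionally hash-partitioning by columns), while coalesce(...) only merges existing partitions and avoids a shuffle. Applying repartition first and coalesce last lets a configuration request an expensive redistribution only when needed and a cheap narrowing otherwise.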
Example 2
Source File: SecondaryStructureWord2VecEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
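This and the remaining mmtf-spark examples end with the same pattern: if the requested output format is json, the dataset is coalesced to a single partition so the write emits one file rather than one part file per partition. Note that coalesce(1) routes all rows through a single task, which is acceptable for datasets of this size but can become a bottleneck on large data.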
Example 3
Source File: SecondaryStructureOneHotEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length < 2) {
        System.err.println("Usage: " + SecondaryStructureOneHotEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat> [<modelFileName>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureOneHotEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data     : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq   : " + data.count());

    // add one-hot encoded sequence feature vector to dataset
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.oneHotEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 4
Source File: SecondaryStructureShiftedWord2VecEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant set
    // (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // create a Word2Vec representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50; // dimension of the feature vector
    data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 5
Source File: SecondaryStructureElementsWord2VecEncoder.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> <fileFormat>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // extract helical sequence segments
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
    System.out.println(data.count());
    data.show(10, false);

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
    data.show(50, false);

    // optionally, save results
    if (args.length > 0) {
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
    }

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 6
Source File: SecondaryStructureWord2VecModelEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet),
 *             args[2] word2VecModelFile
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add Word2Vec encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 7
Source File: SecondaryStructureBlosum62Encoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data     : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq   : " + data.count());

    // add a BLOSUM62 encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.blosum62Encode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 8
Source File: SecondaryStructurePropertyEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data     : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq   : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.propertyEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}