Java Code Examples for org.apache.spark.sql.Dataset#coalesce()
The following examples show how to use org.apache.spark.sql.Dataset#coalesce().
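Before the project examples, here is a minimal, self-contained sketch of the call itself. coalesce(n) returns a new Dataset with at most n partitions by merging existing partitions without a full shuffle, which is why the examples below use coalesce(1) to produce a single output file. The class name and input/output paths in this sketch are illustrative, not taken from any of the projects below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CoalesceSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("CoalesceSketch")
                .getOrCreate();

        // hypothetical input path; any multi-partition dataset works
        Dataset<Row> data = spark.read().json("data/input");
        System.out.println("partitions before: " + data.rdd().getNumPartitions());

        // coalesce(1) narrows to a single partition without a shuffle,
        // so the subsequent write produces a single output file
        Dataset<Row> single = data.coalesce(1);
        System.out.println("partitions after: " + single.rdd().getNumPartitions());

        single.write().mode("overwrite").json("data/output");
        spark.stop();
    }
}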
Example 1
Source File: BatchStep.java From envelope with Apache License 2.0
private Dataset<Row> repartition(Dataset<Row> data) {
  int numPartitions = 0;
  List<String> colPartitions = null;

  if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
  }

  if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
    colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
  }

  // repartition by count, by columns, or by both, depending on configuration
  if (numPartitions > 0 && null != colPartitions) {
    data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
  }
  else if (numPartitions > 0) {
    data = data.repartition(numPartitions);
  }
  else if (null != colPartitions) {
    data = data.repartition(RowUtils.toColumnArray(colPartitions));
  }

  // optionally coalesce to a smaller partition count afterwards
  if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
    data = data.coalesce(numPartitions);
  }

  return data;
}
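A note on the design choice above: repartition(...) performs a full shuffle and can either increase or decrease the partition count (optionally hash-partitioning by columns), while coalesce(...) only merges existing partitions and avoids a shuffle. Applying repartition first and coalesce last lets a configuration request an expensive redistribution only when needed and a cheap narrowing otherwise.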
Example 2
Source File: SecondaryStructureWord2VecEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
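This and the remaining mmtf-spark examples end with the same pattern: if the requested output format is json, the dataset is coalesced to a single partition so the write emits one file rather than one part file per partition. Note that coalesce(1) routes all rows through a single task, which is acceptable for datasets of this size but can become a bottleneck on large data.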
Example 3
Source File: SecondaryStructureOneHotEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length < 2) {
        System.err.println("Usage: " + SecondaryStructureOneHotEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat> [<modelFileName>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureOneHotEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data     : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq   : " + data.count());

    // add one-hot encoded sequence feature vector to dataset
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.oneHotEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 4
Source File: SecondaryStructureShiftedWord2VecEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant set
    // (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // create a Word2Vec representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50; // dimension of the feature vector
    data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 5
Source File: SecondaryStructureElementsWord2VecEncoder.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 0 && args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> <fileFormat>]");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // extract helical sequence segments
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
    System.out.println(data.count());
    data.show(10, false);

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);
    data.show(50, false);

    // optionally, save results
    if (args.length > 0) {
        if (args[1].equals("json")) {
            // coalesce data into a single file
            data = data.coalesce(1);
        }
        data.write().mode("overwrite").format(args[1]).save(args[0]);
    }

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 6
Source File: SecondaryStructureWord2VecModelEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet),
 *             args[2] word2VecModelFile
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    if (args.length != 3) {
        System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat> <word2VecModelFile>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // add Word2Vec encoded feature vector using
    // a pre-trained Word2Vec model read from file
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    String modelFileName = args[2];
    data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 7
Source File: SecondaryStructureBlosum62Encoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data     : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq   : " + data.count());

    // add a BLOSUM62 encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.blosum62Encode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 8
Source File: SecondaryStructurePropertyEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data     : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq   : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.propertyEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();
    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}