Java Code Examples for org.apache.spark.sql.Dataset#count()
The following examples show how to use org.apache.spark.sql.Dataset#count().
Each example is taken from an open-source project; its source file and license are noted above the listing.
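Before the project examples, a minimal sketch of the call itself: Dataset#count() is an action that returns the number of rows as a long and triggers a Spark job when invoked. The class name and the people.json input path below are made up for illustration.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DatasetCountSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("DatasetCountSketch")
            .getOrCreate();

        // Hypothetical input file; any DataFrame source works the same way.
        Dataset<Row> people = spark.read().json("people.json");

        // count() is an action: it runs a Spark job and returns the row count.
        long total = people.count();
        System.out.println("Rows: " + total);

        spark.stop();
    }
}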
Example 1
Source File: DataFilterStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");
    dataset = dataset.filter(query);

    dataset.cache();
    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
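Example 1 calls count() only to test whether the filtered dataset is empty. On Spark 2.4 or later, Dataset#isEmpty() can answer the same question without counting every row; the sketch below is illustrative and its input path and filter expression are made up.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class EmptinessCheck {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("EmptinessCheck")
            .getOrCreate();

        // Hypothetical input and filter, standing in for the query in Example 1.
        Dataset<Row> dataset = spark.read().json("input.json");
        Dataset<Row> filtered = dataset.filter("status = 'ACTIVE'").cache();

        // count() == 0 counts every row of the cached data.
        boolean emptyByCount = filtered.count() == 0;

        // isEmpty() (Spark 2.4+) only needs to find a single row to return false.
        boolean emptyByIsEmpty = filtered.isEmpty();

        System.out.println(emptyByCount + " " + emptyByIsEmpty);
        spark.stop();
    }
}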
Example 2
Source File: TestSuite.java From stocator with Apache License 2.0
public void test14(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type) throws Exception {
    System.out.println("*********************************");
    System.out.println("T14: Append mode " + containerOut);
    String o1 = containerOut + "myData";
    try {
        createAppendObject("T14 - first append", schemaFlights, o1, type);
        long baseCount = schemaFlights.count();
        System.out
            .println("***T14-1 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-1-" + type, type, o1, spark, baseCount, 1);
        createAppendObject("T14 - second append", schemaFlights, o1, type);
        baseCount = schemaFlights.count();
        System.out
            .println("***T14-2 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-2-" + type, type, o1, spark, baseCount, 2);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example 3
Source File: LoopStep.java From envelope with Apache License 2.0
private List<Row> getRowsFromStep(Set<Step> steps) {
    String stepName = config.getString(STEP_PROPERTY);
    Optional<Step> optionalStep = StepUtils.getStepForName(stepName, steps);

    if (!optionalStep.isPresent()) {
        throw new RuntimeException("Step source for loop step '" + getName() + "' does not exist.");
    }

    Step step = optionalStep.get();
    if (!(step instanceof DataStep)) {
        throw new RuntimeException("Step source for loop step '" + getName() + "' is not a data step.");
    }

    Dataset<Row> stepRows = ((DataStep)step).getData();
    if (stepRows.count() > 1000) {
        throw new RuntimeException("Step source for loop step '" + getName() + "' can not provide more than 1000 values to loop over");
    }

    return stepRows.collectAsList();
}
Example 4
Source File: TestSuite.java From stocator with Apache License 2.0
public void test13(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type) throws Exception {
    System.out.println("*********************************");
    System.out.println("T13: Going to create nested structures and check globber on " + containerOut);
    String o1 = containerOut + "Dir/result/jobid=21274501-57a1-4690-9a84-9d2294fcf64d";
    try {
        if (dataCreate) {
            createObject("T13", schemaFlights, o1, type);
        }
        long baseCount = schemaFlights.count();
        String path = containerOut + "Dir/result/jobid=21274501-57a1-4690-9a84-9d2294fcf64";
        System.out.println(
            "***T13-1 : Reading " + path + " from " + containerOut + ", base unit " + baseCount + " type " + type);
        readAndTest("T13-1-" + type, type, path, spark, baseCount, 1);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    }
}
Example 5
Source File: CountDatasetRule.java From envelope with Apache License 2.0
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
    if (isDependency()) {
        Dataset<Row> expectedDependency = stepDependencies.get(dependency);
        if (expectedDependency.count() == 1 && expectedDependency.schema().fields().length == 1
            && expectedDependency.schema().apply(0).dataType() == DataTypes.LongType) {
            expected = expectedDependency.collectAsList().get(0).getLong(0);
        } else {
            throw new RuntimeException("Step dependency for count rule must have one row with a single field of long type");
        }
    }
    if (expected < 0) {
        throw new RuntimeException("Failed to determine expected count: must be specified either as literal or step dependency");
    }
    return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
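Note the two different count calls in Example 5: expectedDependency.count() is an action that returns a long immediately, while dataset.groupBy().count() is a lazy transformation that yields a one-row Dataset with a single LongType column named count. A minimal sketch of the difference, using an assumed local SparkSession and made-up in-memory data:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CountVsGroupByCount {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("CountVsGroupByCount")
            .getOrCreate();

        Dataset<Row> words = spark
            .createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING())
            .toDF("word");

        // Action: runs a job immediately and returns the row count as a long.
        long total = words.count();

        // Transformation: stays lazy and produces a one-row Dataset with a
        // single LongType "count" column, the shape CountDatasetRule expects.
        Dataset<Row> counted = words.groupBy().count();
        long fromDataset = counted.collectAsList().get(0).getLong(0);

        System.out.println(total + " == " + fromDataset);
        spark.stop();
    }
}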
Example 6
Source File: ConceptMaps.java From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<Row> conceptMaps) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

    if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
    }

    // Remove the concept contents for persistence. This is most easily done in the ConceptMap
    // object by setting the group to an empty list.
    // Dataset<Row> withoutConcepts =
    JavaRDD<Row> withoutConceptsRdd = conceptMaps.javaRDD().map(new ConceptMapRemover());

    Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
        conceptMapRowConverter.getSchema());

    Dataset<Mapping> newMappings = conceptMaps.flatMap((Row row) -> expandMappingsIterator(row),
        MAPPING_ENCODER);

    return withConceptMaps(withoutConcepts, newMappings);
}
Example 7
Source File: ValueSets.java From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<Row> valueSets) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

    // Ensure that there are no duplicates among the value sets
    if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
    }

    JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

    // The value set concepts will be stored in the values table for persistence, so we remove
    // them from the individual value sets. This can be done most easily by setting concepts to an
    // empty list.
    JavaRDD<Row> withoutConceptsRdd = valueSetsRdd.map(new RemoveConcepts(fhirVersion));
    Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
        valueSetRowConverter.getSchema());

    JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));
    Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

    return withValueSets(withoutConcepts, newValues);
}
Example 8
Source File: TestPerformanceRegression.java From chronix.spark with Apache License 2.0
public static void main(String[] args) throws SolrServerException, IOException {
    ChronixSparkLoader loader = new ChronixSparkLoader();

    ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
    SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

    // BENCHMARK START ...............................
    long start = System.currentTimeMillis();
    for (int i = 0; i < LOOPS; i++) {

        //Load data into ChronixRDD
        ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

        //Some actions
        double mean = rdd.mean();
        double approxMean = rdd.approxMean();
        long observationCount = rdd.countObservations();
        double max = rdd.max();
        double min = rdd.min();
        Iterator<MetricTimeSeries> it = rdd.iterator();
        while (it.hasNext()) {
            MetricTimeSeries mts = it.next();
            System.out.print(".");
        }

        //DataFrame operations
        Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
        ds.count();
    }
    long stop = System.currentTimeMillis();
    // BENCHMARK STOP ...................................

    System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

    chronixSparkContext.getSparkContext().close();
}
Example 9
Source File: TestSuite.java From stocator with Apache License 2.0
private void countAndCompare(Dataset<Row> inSpark, long readRecords, String msg) throws Exception {
    long totalInSpark = inSpark.count();
    if (totalInSpark != readRecords) {
        System.out.println("*********************************");
        System.out.println(msg + ": Records that were written into object store doesn't match");
        System.out.println(msg + ": Readed from object store: " + readRecords + ", expected: " + totalInSpark);
        throw new Exception(msg + ": Readed from object store: " + readRecords + ", expected: " + totalInSpark);
    } else {
        System.out.println(
            msg + " Completed successfully. Readed from object store: " + readRecords + ", expected: " + totalInSpark);
    }
}
Example 10
Source File: DecisionStep.java From envelope with Apache License 2.0
private boolean evaluateStepByKeyDecision(Set<Step> steps) {
    Optional<Step> optionalStep = StepUtils.getStepForName(stepByKeyStepName, steps);

    if (!optionalStep.isPresent()) {
        throw new RuntimeException("Unknown decision step's key step: " + stepByValueStepName);
    }

    if (!(optionalStep.get() instanceof DataStep)) {
        throw new RuntimeException("Decision step's key step is not a data step: " + optionalStep.get().getName());
    }

    Dataset<Row> keyDataset = ((DataStep)optionalStep.get()).getData();

    if (keyDataset.schema().fields().length != 2 ||
        keyDataset.schema().fields()[0].dataType() != DataTypes.StringType ||
        keyDataset.schema().fields()[1].dataType() != DataTypes.BooleanType) {
        throw new RuntimeException("Decision step's key step must contain a string column and then a boolean column");
    }

    String keyColumnName = keyDataset.schema().fieldNames()[0];
    String whereClause = keyColumnName + " = '" + stepByKeyKey + "'";
    Dataset<Row> decisionDataset = keyDataset.where(whereClause);

    if (decisionDataset.count() != 1) {
        throw new RuntimeException("Decision step's key step must contain a single record for the given key");
    }

    boolean decision = decisionDataset.collectAsList().get(0).getBoolean(1);

    return decision;
}
Example 11
Source File: ConceptMaps.java From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

    if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
    }

    // Remove the concept contents for persistence. This is most easily done in the ConceptMap
    // object by setting the group to an empty list.
    Dataset<ConceptMap> withoutConcepts = conceptMaps
        .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> {

            // Remove the elements rather than the groups to preserve the
            // "unmapped" structure in a group that can refer to other
            // concept maps.
            ConceptMap withoutElements = conceptMap.copy();

            List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>();

            for (ConceptMapGroupComponent group: withoutElements.getGroup()) {
                group.setElement(new ArrayList<>());
                updatedGroups.add(group);
            }

            withoutElements.setGroup(updatedGroups);

            return withoutElements;
        }, CONCEPT_MAP_ENCODER);

    Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator,
        MAPPING_ENCODER);

    return withConceptMaps(withoutConcepts, newMappings);
}
Example 12
Source File: ValueSets.java From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<ValueSet> valueSets) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

    // Ensure that there are no duplicates among the value sets
    if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
    }

    // The value set concepts will be stored in the values table for persistence, so we remove
    // them from the individual value sets. This can be done most easily by setting concepts to an
    // empty list.
    Dataset<ValueSet> withoutConcepts = valueSets.map((MapFunction<ValueSet,ValueSet>) valueSet -> {
        ValueSet valueSetWithoutConcepts = valueSet.copy();

        List<ConceptSetComponent> updatedInclusions = new ArrayList<>();

        for (ConceptSetComponent inclusion: valueSet.getCompose().getInclude()) {
            ConceptSetComponent inclusionWithoutConcepts = inclusion.copy();

            inclusionWithoutConcepts.setConcept(new ArrayList<>());
            updatedInclusions.add(inclusionWithoutConcepts);
        }

        valueSetWithoutConcepts.getCompose().setInclude(updatedInclusions);

        return valueSetWithoutConcepts;
    }, VALUE_SET_ENCODER);

    Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator, getValueEncoder());

    return withValueSets(withoutConcepts, newValues);
}
Example 13
Source File: MetroAnalysisJob.java From hui-bigdata-spark with Apache License 2.0
/**
 * Core data-processing logic.
 * @param sparkContext
 * @param inPutPath
 * @param outPutPath
 */
private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
    SparkJobUtil.checkFileExists(inPutPath);

    SQLContext sqlContext = new SQLContext(sparkContext);
    // sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

    // register a temporary snapshot table
    Dataset<Row> dataset = sqlContext.read().json(inPutPath);
    dataset.registerTempTable("hui_metro_testjson");
    dataset.show(10);

    Dataset<Row> resultFrame = sqlContext.sql(SQL);

    if (resultFrame.count() > 0) {
        resultFrame.repartition(3).write()
            .mode(SaveMode.Append).json(outPutPath);
    }

    resultFrame.show(10);

    // write the result to the database
    MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
    jdbcConfig.init();
    resultFrame.write().mode("append")
        .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
}
Example 14
Source File: SparkCubingJobTest.java From kylin-on-parquet-v2 with Apache License 2.0
private void queryTest(CubeSegment segment) {
    // Result cmp: Parquet vs Spark SQL
    for (LayoutEntity entity : MetadataConverter.extractEntityList2JavaList(segment.getCubeInstance())) {
        // Parquet result
        Dataset<Row> layoutDataset = StorageFactory.createEngineAdapter(new IStorageAware() {
            // Hardcode
            @Override
            public int getStorageType() {
                return 4;
            }
        }, NSparkCubingEngine.NSparkCubingStorage.class)
            .getFrom(PathManager.getParquetStoragePath(segment.getConfig(), segment.getCubeInstance().getName(),
                segment.getName(), segment.getStorageLocationIdentifier(), String.valueOf(entity.getId())), ss);

        Set<Integer> measures = new HashSet<Integer>();
        Set<Integer> rowKeys = entity.getOrderedDimensions().keySet();
        for (Map.Entry<Integer, FunctionDesc> entry : entity.getOrderedMeasures().entrySet()) {
            String type = entry.getValue().returnType().dataType();
            if (type.equals("hllc") || type.equals("topn") || type.equals("percentile")) {
                continue;
            }
            measures.add(entry.getKey());
        }

        layoutDataset = layoutDataset.select(NSparkCubingUtil.getColumns(rowKeys, measures))
            .sort(NSparkCubingUtil.getColumns(rowKeys));
        System.out.println("Query cuboid ------------ " + entity.getId());
        layoutDataset = dsConvertToOriginal(layoutDataset, entity);
        layoutDataset.show(10);

        // Spark sql
        Dataset<Row> ds = initFlatTable(segment);
        if (!entity.isTableIndex()) {
            ds = CuboidAggregator.agg(ss, ds, entity.getOrderedDimensions().keySet(), entity.getOrderedMeasures(),
                null, true);
        }

        Dataset<Row> exceptDs = ds.select(NSparkCubingUtil.getColumns(rowKeys, measures))
            .sort(NSparkCubingUtil.getColumns(rowKeys));
        System.out.println("Spark sql ------------ ");
        exceptDs.show(10);

        long layoutCount = layoutDataset.count();
        long expectCount = exceptDs.count();
        Assert.assertEquals(layoutCount, expectCount);

        String msg = SparkQueryTest.checkAnswer(layoutDataset, exceptDs, false);
        Assert.assertNull(msg);
    }
}
Example 15
Source File: GeoWaveSparkSQLIT.java From geowave with Apache License 2.0
@Test
public void testCreateDataFrame() throws Exception {
    // Set up Spark
    final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
    final SparkContext context = session.sparkContext();

    // ingest test points
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

    final SqlQueryRunner queryRunner = new SqlQueryRunner();
    queryRunner.setSparkSession(session);

    try {
        // Load RDD from datastore, no filters
        final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
        final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

        final long count = javaRdd.count();
        LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

        queryRunner.addInputStore(dataStore, null, "features");

        final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

        Dataset<Row> results = queryRunner.run();
        final long containsCount = results.count();
        LOGGER.warn("Got " + containsCount + " for GeomContains test");

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
        results = queryRunner.run();
        final long withinCount = results.count();
        LOGGER.warn("Got " + withinCount + " for GeomWithin test");

        Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

        // Test the output writer
        final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);
        sqlResultsWriter.writeResults("sqltest");

        queryRunner.removeAllStores();

        // Test other spatial UDFs
        final String line1 = "LINESTRING(0 0, 10 10)";
        final String line2 = "LINESTRING(0 10, 10 0)";
        queryRunner.setSql(
            "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");

        Row result = queryRunner.run().head();

        final boolean intersect = result.getBoolean(0);
        LOGGER.warn("GeomIntersects returned " + intersect);

        Assert.assertTrue("Lines should intersect", intersect);

        queryRunner.setSql(
            "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
        result = queryRunner.run().head();

        final boolean disjoint = result.getBoolean(0);
        LOGGER.warn("GeomDisjoint returned " + disjoint);

        Assert.assertFalse("Lines should not be disjoint", disjoint);
    } catch (final Exception e) {
        e.printStackTrace();
        TestUtils.deleteAll(dataStore);
        Assert.fail(
            "Error occurred while testing a bounding box query of spatial index: '"
                + e.getLocalizedMessage() + "'");
    }

    // Clean up
    TestUtils.deleteAll(dataStore);
}
Example 16
Source File: PdbToUniProt.java From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * residue-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainId (e.g., 1XYZ.A).
 * This method reads a cached file and downloads updates.
 *
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt residue-level mappings
 * @throws IOException
 */
public static Dataset<Row> getResidueMappings(List<String> ids) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();

    boolean withChainId = ids.size() > 0 && ids.get(0).length() > 4;

    // create dataset of ids
    Dataset<Row> df = spark.createDataset(ids, Encoders.STRING()).toDF("id");
    // get cached mappings
    Dataset<Row> mapping = getCachedResidueMappings();

    // dataset for non-cached mappings
    Dataset<Row> notCached = null;
    // dataset with PDB Ids to be downloaded
    Dataset<Row> toDownload = null;

    if (withChainId) {
        // get subset of requested ids from cached dataset
        mapping = mapping.join(df, mapping.col("structureChainId").equalTo(df.col("id"))).drop("id");
        // get ids that are not in the cached dataset
        notCached = df.join(mapping, df.col("id").equalTo(mapping.col("structureChainId")), "left_anti").cache();
        // create dataset of PDB Ids to be downloaded
        toDownload = notCached.withColumn("id", col("id").substr(0, 4)).distinct().cache();
    } else {
        // get subset of requested ids from cached dataset
        mapping = mapping.withColumn("pdbId", col("structureChainId").substr(0, 4));
        mapping = mapping.join(df, mapping.col("pdbId").equalTo(df.col("id"))).drop("id");
        // create dataset of PDB Ids to be downloaded
        toDownload = df.join(mapping, df.col("id").equalTo(mapping.col("pdbId")), "left_anti").distinct().cache();
        mapping = mapping.drop("pdbId");
    }

    toDownload = toDownload.distinct().cache();

    // download data that are not in the cache
    if (toDownload.count() > 0) {
        Dataset<Row> unpData = getChainMappings().select("structureId").distinct();
        toDownload = toDownload.join(unpData, toDownload.col("id").equalTo(unpData.col("structureId"))).drop("structureId").cache();
        System.out.println("Downloading mapping for " + toDownload.count() + " PDB structures.");
        Dataset<Row> downloadedData = downloadData(toDownload);

        // since data are downloaded for all chains in structure, make sure to only include the requested chains.
        if (withChainId) {
            downloadedData = downloadedData.join(notCached,
                downloadedData.col("structureChainId").equalTo(notCached.col("id"))).drop("id");
        }

        mapping = mapping.union(downloadedData);
    }

    return mapping;
}
Example 17
Source File: AtpInteractionAnalysis.java From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
        .groupBy("residue2")
        .count();

    topGroups
        .sort(col("count").desc()) // sort descending by count
        .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
        .groupBy("residue2","atom2")
        .count();

    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
        .sort(col("frequency").desc()) // sort descending
        .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 18
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
        .groupBy("residue2")
        .count();

    topGroups
        .sort(col("count").desc()) // sort descending by count
        .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("residue2","atom2")
        .count();

    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
        .filter("frequency > 0.01") // filter out occurrences < 1 %
        .sort(col("frequency").desc()) // sort descending
        .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("element2")
        .count();

    topElements.withColumn("frequency", col("count").divide(n))
        .filter("frequency > 0.01") // filter out occurrences < 1 %
        .sort(col("frequency").desc()) // sort descending
        .show(10);

    interactions
        .groupBy("element2")
        .avg("distance")
        .sort("avg(distance)")
        .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
        .groupBy("element2")
        .agg(count("distance"),avg("distance"),min("distance"),max("distance"),kurtosis("distance"))
        .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 19
Source File: DatasetFileConverter.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String inputFile = cmd.getOptionValue("input-file");
    int partitions = Integer.parseInt(cmd.getOptionValue("partitions", "0"));
    String fileFormat = cmd.getOptionValue("file-format", "");
    String compressionCodec = cmd.getOptionValue("compression-codec", "");

    SparkSession spark = SparkSession.builder().master("local[*]")
        .appName(DatasetFileConverter.class.getSimpleName())
        .getOrCreate();

    spark.conf().set("spark.sql.orc.impl", "native");

    // encode options in file name
    String outputFile = getFileExtension(inputFile);
    if (partitions > 1) {
        outputFile += "." + partitions;
    }
    outputFile += "." + fileFormat;
    if (!compressionCodec.isEmpty()) {
        outputFile += "." + compressionCodec;
    }

    System.out.println("Input file : " + inputFile);
    System.out.println("Output file: " + outputFile);

    long t1 = System.nanoTime();

    Dataset<Row> dataset = null;

    // read dataset
    if (inputFile.contains("orc")) {
        dataset = spark.read().format("orc").load(inputFile);
    } else if (inputFile.contains("csv")) {
        dataset = spark.read().format("csv").load(inputFile);
    } else {
        dataset = spark.read().format("parquet").load(inputFile);
    }

    long records = dataset.count();

    // write reformatted dataset
    saveDataset(dataset, partitions, fileFormat, compressionCodec, outputFile);

    long t2 = System.nanoTime();
    System.out.println(records + " records reformatted in " + (t2-t1)/1E9 + " sec.");

    spark.stop();
}
Example 20
Source File: CreatePdbToUniProtMappingFile.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException {

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputFile = cmd.getOptionValue("output-file");
    boolean build = cmd.hasOption("build");
    boolean update = cmd.hasOption("update");
    // these default options for fileFormat and compressionCodec
    // provide the best compression
    String fileFormat = cmd.getOptionValue("file-format", "orc");
    String compressionCodec = cmd.getOptionValue("compression-codec", "lzo");

    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName(CreatePdbToUniProtMappingFile.class.getSimpleName())
        .getOrCreate();

    String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());

    long t1 = System.nanoTime();

    String dirName = outputFile + "_" + timeStamp + "_tmp";
    String fileName = outputFile + "_" + timeStamp + "." + fileFormat + "." + compressionCodec;

    if (build) {
        // create a new mapping file from scratch
        PdbToUniProt.buildDataset(dirName, "orc", "lzo");
    } else if (update) {
        // create an updated mapping file from the cached version
        PdbToUniProt.updateDataset(dirName, "orc", "lzo");
    }

    long t2 = System.nanoTime();
    System.out.println("Time to build/update dataset: " + (t2-t1)/1E9 + " sec.");

    // By default, spark creates a directory of files.
    // For convenience, coalesce the data into a single file.
    Dataset<Row> ds = spark.read().orc(dirName);
    long count = ds.count();

    int partitions = 1;
    DatasetFileConverter.saveDataset(ds, partitions, fileFormat, compressionCodec, fileName);
    FileUtils.deleteDirectory(new File(dirName));

    System.out.println(count + " records saved to: " + fileName);

    long t3 = System.nanoTime();
    System.out.println("Time to reformat data: " + (t3-t2)/1E9 + " sec.");

    spark.stop();
}