Java Code Examples for org.apache.spark.sql.Dataset#count()
The following examples show how to use org.apache.spark.sql.Dataset#count().
Each example is taken from an open-source project; its source file and license are noted above the listing.
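Before the project examples, a minimal sketch of the call itself: Dataset#count() is an action that returns the number of rows as a long and triggers a Spark job when invoked. The class name and the people.json input path below are made up for illustration.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DatasetCountSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("DatasetCountSketch")
            .getOrCreate();

        // Hypothetical input file; any DataFrame source works the same way.
        Dataset<Row> people = spark.read().json("people.json");

        // count() is an action: it runs a Spark job and returns the row count.
        long total = people.count();
        System.out.println("Rows: " + total);

        spark.stop();
    }
}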
Example 1
Source File: DataFilterStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");
    dataset = dataset.filter(query);

    dataset.cache();
    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
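Example 1 calls count() only to test whether the filtered dataset is empty. On Spark 2.4 or later, Dataset#isEmpty() can answer the same question without counting every row; the sketch below is illustrative and its input path and filter expression are made up.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class EmptinessCheck {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("EmptinessCheck")
            .getOrCreate();

        // Hypothetical input and filter, standing in for the query in Example 1.
        Dataset<Row> dataset = spark.read().json("input.json");
        Dataset<Row> filtered = dataset.filter("status = 'ACTIVE'").cache();

        // count() == 0 counts every row of the cached data.
        boolean emptyByCount = filtered.count() == 0;

        // isEmpty() (Spark 2.4+) only needs to find a single row to return false.
        boolean emptyByIsEmpty = filtered.isEmpty();

        System.out.println(emptyByCount + " " + emptyByIsEmpty);
        spark.stop();
    }
}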
Example 2
Source File: TestSuite.java From stocator with Apache License 2.0
public void test14(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type) throws Exception {
    System.out.println("*********************************");
    System.out.println("T14: Append mode " + containerOut);
    String o1 = containerOut + "myData";
    try {
        createAppendObject("T14 - first append", schemaFlights, o1, type);
        long baseCount = schemaFlights.count();
        System.out
            .println("***T14-1 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-1-" + type, type, o1, spark, baseCount, 1);
        createAppendObject("T14 - second append", schemaFlights, o1, type);
        baseCount = schemaFlights.count();
        System.out
            .println("***T14-2 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
        readAndTest("T14-2-" + type, type, o1, spark, baseCount, 2);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example 3
Source File: LoopStep.java From envelope with Apache License 2.0
private List<Row> getRowsFromStep(Set<Step> steps) {
    String stepName = config.getString(STEP_PROPERTY);
    Optional<Step> optionalStep = StepUtils.getStepForName(stepName, steps);

    if (!optionalStep.isPresent()) {
        throw new RuntimeException("Step source for loop step '" + getName() + "' does not exist.");
    }

    Step step = optionalStep.get();
    if (!(step instanceof DataStep)) {
        throw new RuntimeException("Step source for loop step '" + getName() + "' is not a data step.");
    }

    Dataset<Row> stepRows = ((DataStep)step).getData();
    if (stepRows.count() > 1000) {
        throw new RuntimeException("Step source for loop step '" + getName() + "' can not provide more than 1000 values to loop over");
    }

    return stepRows.collectAsList();
}
Example 4
Source File: TestSuite.java From stocator with Apache License 2.0
public void test13(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type) throws Exception {
    System.out.println("*********************************");
    System.out.println("T13: Going to create nested structures and check globber on " + containerOut);
    String o1 = containerOut + "Dir/result/jobid=21274501-57a1-4690-9a84-9d2294fcf64d";
    try {
        if (dataCreate) {
            createObject("T13", schemaFlights, o1, type);
        }
        long baseCount = schemaFlights.count();
        String path = containerOut + "Dir/result/jobid=21274501-57a1-4690-9a84-9d2294fcf64";
        System.out.println(
            "***T13-1 : Reading " + path + " from " + containerOut + ", base unit " + baseCount + " type " + type);
        readAndTest("T13-1-" + type, type, path, spark, baseCount, 1);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    }
}
Example 5
Source File: CountDatasetRule.java From envelope with Apache License 2.0
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
    if (isDependency()) {
        Dataset<Row> expectedDependency = stepDependencies.get(dependency);
        if (expectedDependency.count() == 1 && expectedDependency.schema().fields().length == 1
            && expectedDependency.schema().apply(0).dataType() == DataTypes.LongType) {
            expected = expectedDependency.collectAsList().get(0).getLong(0);
        } else {
            throw new RuntimeException("Step dependency for count rule must have one row with a single field of long type");
        }
    }
    if (expected < 0) {
        throw new RuntimeException("Failed to determine expected count: must be specified either as literal or step dependency");
    }
    return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
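Note the two different count calls in Example 5: expectedDependency.count() is an action that returns a long immediately, while dataset.groupBy().count() is a lazy transformation that yields a one-row Dataset with a single LongType column named count. A minimal sketch of the difference, using an assumed local SparkSession and made-up in-memory data:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CountVsGroupByCount {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("CountVsGroupByCount")
            .getOrCreate();

        Dataset<Row> words = spark
            .createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING())
            .toDF("word");

        // Action: runs a job immediately and returns the row count as a long.
        long total = words.count();

        // Transformation: stays lazy and produces a one-row Dataset with a
        // single LongType "count" column, the shape CountDatasetRule expects.
        Dataset<Row> counted = words.groupBy().count();
        long fromDataset = counted.collectAsList().get(0).getLong(0);

        System.out.println(total + " == " + fromDataset);
        spark.stop();
    }
}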
Example 6
Source File: ConceptMaps.java From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<Row> conceptMaps) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

    if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
    }

    // Remove the concept contents for persistence. This is most easily done in the ConceptMap
    // object by setting the group to an empty list.
    // Dataset<Row> withoutConcepts =
    JavaRDD<Row> withoutConceptsRdd = conceptMaps.javaRDD().map(new ConceptMapRemover());

    Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
        conceptMapRowConverter.getSchema());

    Dataset<Mapping> newMappings = conceptMaps.flatMap((Row row) -> expandMappingsIterator(row),
        MAPPING_ENCODER);

    return withConceptMaps(withoutConcepts, newMappings);
}
Example 7
Source File: ValueSets.java From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<Row> valueSets) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

    // Ensure that there are no duplicates among the value sets
    if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
    }

    JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

    // The value set concepts will be stored in the values table for persistence, so we remove
    // them from the individual value sets. This can be done most easily by setting concepts to an
    // empty list.
    JavaRDD<Row> withoutConceptsRdd = valueSetsRdd.map(new RemoveConcepts(fhirVersion));
    Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
        valueSetRowConverter.getSchema());

    JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));
    Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

    return withValueSets(withoutConcepts, newValues);
}
Example 8
Source File: TestPerformanceRegression.java From chronix.spark with Apache License 2.0
public static void main(String[] args) throws SolrServerException, IOException {
    ChronixSparkLoader loader = new ChronixSparkLoader();

    ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
    SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

    // BENCHMARK START ...............................
    long start = System.currentTimeMillis();
    for (int i = 0; i < LOOPS; i++) {

        //Load data into ChronixRDD
        ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

        //Some actions
        double mean = rdd.mean();
        double approxMean = rdd.approxMean();
        long observationCount = rdd.countObservations();
        double max = rdd.max();
        double min = rdd.min();
        Iterator<MetricTimeSeries> it = rdd.iterator();
        while (it.hasNext()) {
            MetricTimeSeries mts = it.next();
            System.out.print(".");
        }

        //DataFrame operations
        Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
        ds.count();
    }
    long stop = System.currentTimeMillis();
    // BENCHMARK STOP ...................................

    System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

    chronixSparkContext.getSparkContext().close();
}
Example 9
Source File: TestSuite.java From stocator with Apache License 2.0
private void countAndCompare(Dataset<Row> inSpark, long readRecords, String msg) throws Exception {
    long totalInSpark = inSpark.count();
    if (totalInSpark != readRecords) {
        System.out.println("*********************************");
        System.out.println(msg + ": Records that were written into object store doesn't match");
        System.out.println(msg + ": Readed from object store: " + readRecords + ", expected: " + totalInSpark);
        throw new Exception(msg + ": Readed from object store: " + readRecords + ", expected: " + totalInSpark);
    } else {
        System.out.println(
            msg + " Completed successfully. Readed from object store: " + readRecords + ", expected: " + totalInSpark);
    }
}
Example 10
Source File: DecisionStep.java From envelope with Apache License 2.0
private boolean evaluateStepByKeyDecision(Set<Step> steps) {
    Optional<Step> optionalStep = StepUtils.getStepForName(stepByKeyStepName, steps);

    if (!optionalStep.isPresent()) {
        throw new RuntimeException("Unknown decision step's key step: " + stepByValueStepName);
    }

    if (!(optionalStep.get() instanceof DataStep)) {
        throw new RuntimeException("Decision step's key step is not a data step: " + optionalStep.get().getName());
    }

    Dataset<Row> keyDataset = ((DataStep)optionalStep.get()).getData();

    if (keyDataset.schema().fields().length != 2 ||
        keyDataset.schema().fields()[0].dataType() != DataTypes.StringType ||
        keyDataset.schema().fields()[1].dataType() != DataTypes.BooleanType) {
        throw new RuntimeException("Decision step's key step must contain a string column and then a boolean column");
    }

    String keyColumnName = keyDataset.schema().fieldNames()[0];
    String whereClause = keyColumnName + " = '" + stepByKeyKey + "'";
    Dataset<Row> decisionDataset = keyDataset.where(whereClause);

    if (decisionDataset.count() != 1) {
        throw new RuntimeException("Decision step's key step must contain a single record for the given key");
    }

    boolean decision = decisionDataset.collectAsList().get(0).getBoolean(1);

    return decision;
}
Example 11
Source File: ConceptMaps.java From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

    if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
    }

    // Remove the concept contents for persistence. This is most easily done in the ConceptMap
    // object by setting the group to an empty list.
    Dataset<ConceptMap> withoutConcepts = conceptMaps
        .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> {

            // Remove the elements rather than the groups to preserve the
            // "unmapped" structure in a group that can refer to other
            // concept maps.
            ConceptMap withoutElements = conceptMap.copy();

            List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>();

            for (ConceptMapGroupComponent group: withoutElements.getGroup()) {
                group.setElement(new ArrayList<>());
                updatedGroups.add(group);
            }

            withoutElements.setGroup(updatedGroups);

            return withoutElements;
        }, CONCEPT_MAP_ENCODER);

    Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator,
        MAPPING_ENCODER);

    return withConceptMaps(withoutConcepts, newMappings);
}
Example 12
Source File: ValueSets.java From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<ValueSet> valueSets) {
    Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

    // Ensure that there are no duplicates among the value sets
    if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {
        throw new IllegalArgumentException(
            "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
    }

    // The value set concepts will be stored in the values table for persistence, so we remove
    // them from the individual value sets. This can be done most easily by setting concepts to an
    // empty list.
    Dataset<ValueSet> withoutConcepts = valueSets.map((MapFunction<ValueSet,ValueSet>) valueSet -> {
        ValueSet valueSetWithoutConcepts = valueSet.copy();

        List<ConceptSetComponent> updatedInclusions = new ArrayList<>();

        for (ConceptSetComponent inclusion: valueSet.getCompose().getInclude()) {
            ConceptSetComponent inclusionWithoutConcepts = inclusion.copy();

            inclusionWithoutConcepts.setConcept(new ArrayList<>());
            updatedInclusions.add(inclusionWithoutConcepts);
        }

        valueSetWithoutConcepts.getCompose().setInclude(updatedInclusions);

        return valueSetWithoutConcepts;
    }, VALUE_SET_ENCODER);

    Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator, getValueEncoder());

    return withValueSets(withoutConcepts, newValues);
}
Example 13
Source File: MetroAnalysisJob.java From hui-bigdata-spark with Apache License 2.0
/**
 * Core data-processing logic.
 * @param sparkContext
 * @param inPutPath
 * @param outPutPath
 */
private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
    SparkJobUtil.checkFileExists(inPutPath);

    SQLContext sqlContext = new SQLContext(sparkContext);
    // sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

    // register a temporary snapshot table
    Dataset<Row> dataset = sqlContext.read().json(inPutPath);
    dataset.registerTempTable("hui_metro_testjson");
    dataset.show(10);

    Dataset<Row> resultFrame = sqlContext.sql(SQL);

    if (resultFrame.count() > 0) {
        resultFrame.repartition(3).write()
            .mode(SaveMode.Append).json(outPutPath);
    }

    resultFrame.show(10);

    // write the result to the database
    MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
    jdbcConfig.init();
    resultFrame.write().mode("append")
        .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
}
Example 14
Source File: SparkCubingJobTest.java From kylin-on-parquet-v2 with Apache License 2.0
private void queryTest(CubeSegment segment) {
    // Result cmp: Parquet vs Spark SQL
    for (LayoutEntity entity : MetadataConverter.extractEntityList2JavaList(segment.getCubeInstance())) {
        // Parquet result
        Dataset<Row> layoutDataset = StorageFactory.createEngineAdapter(new IStorageAware() {
            // Hardcode
            @Override
            public int getStorageType() {
                return 4;
            }
        }, NSparkCubingEngine.NSparkCubingStorage.class)
            .getFrom(PathManager.getParquetStoragePath(segment.getConfig(), segment.getCubeInstance().getName(),
                segment.getName(), segment.getStorageLocationIdentifier(), String.valueOf(entity.getId())), ss);

        Set<Integer> measures = new HashSet<Integer>();
        Set<Integer> rowKeys = entity.getOrderedDimensions().keySet();
        for (Map.Entry<Integer, FunctionDesc> entry : entity.getOrderedMeasures().entrySet()) {
            String type = entry.getValue().returnType().dataType();
            if (type.equals("hllc") || type.equals("topn") || type.equals("percentile")) {
                continue;
            }
            measures.add(entry.getKey());
        }

        layoutDataset = layoutDataset.select(NSparkCubingUtil.getColumns(rowKeys, measures))
            .sort(NSparkCubingUtil.getColumns(rowKeys));
        System.out.println("Query cuboid ------------ " + entity.getId());
        layoutDataset = dsConvertToOriginal(layoutDataset, entity);
        layoutDataset.show(10);

        // Spark sql
        Dataset<Row> ds = initFlatTable(segment);
        if (!entity.isTableIndex()) {
            ds = CuboidAggregator.agg(ss, ds, entity.getOrderedDimensions().keySet(), entity.getOrderedMeasures(),
                null, true);
        }

        Dataset<Row> exceptDs = ds.select(NSparkCubingUtil.getColumns(rowKeys, measures))
            .sort(NSparkCubingUtil.getColumns(rowKeys));
        System.out.println("Spark sql ------------ ");
        exceptDs.show(10);

        long layoutCount = layoutDataset.count();
        long expectCount = exceptDs.count();
        Assert.assertEquals(layoutCount, expectCount);

        String msg = SparkQueryTest.checkAnswer(layoutDataset, exceptDs, false);
        Assert.assertNull(msg);
    }
}
Example 15
Source File: GeoWaveSparkSQLIT.java From geowave with Apache License 2.0
@Test
public void testCreateDataFrame() throws Exception {
    // Set up Spark
    final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
    final SparkContext context = session.sparkContext();

    // ingest test points
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

    final SqlQueryRunner queryRunner = new SqlQueryRunner();
    queryRunner.setSparkSession(session);

    try {
        // Load RDD from datastore, no filters
        final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
        final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

        final long count = javaRdd.count();
        LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

        queryRunner.addInputStore(dataStore, null, "features");

        final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

        Dataset<Row> results = queryRunner.run();
        final long containsCount = results.count();
        LOGGER.warn("Got " + containsCount + " for GeomContains test");

        queryRunner.setSql(
            "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
        results = queryRunner.run();
        final long withinCount = results.count();
        LOGGER.warn("Got " + withinCount + " for GeomWithin test");

        Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

        // Test the output writer
        final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);
        sqlResultsWriter.writeResults("sqltest");

        queryRunner.removeAllStores();

        // Test other spatial UDFs
        final String line1 = "LINESTRING(0 0, 10 10)";
        final String line2 = "LINESTRING(0 10, 10 0)";
        queryRunner.setSql(
            "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");

        Row result = queryRunner.run().head();

        final boolean intersect = result.getBoolean(0);
        LOGGER.warn("GeomIntersects returned " + intersect);

        Assert.assertTrue("Lines should intersect", intersect);

        queryRunner.setSql(
            "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
        result = queryRunner.run().head();

        final boolean disjoint = result.getBoolean(0);
        LOGGER.warn("GeomDisjoint returned " + disjoint);

        Assert.assertFalse("Lines should not be disjoint", disjoint);
    } catch (final Exception e) {
        e.printStackTrace();
        TestUtils.deleteAll(dataStore);
        Assert.fail(
            "Error occurred while testing a bounding box query of spatial index: '"
                + e.getLocalizedMessage() + "'");
    }

    // Clean up
    TestUtils.deleteAll(dataStore);
}
Example 16
Source File: PdbToUniProt.java From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * residue-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainId (e.g., 1XYZ.A).
 * This method reads a cached file and downloads updates.
 *
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt residue-level mappings
 * @throws IOException
 */
public static Dataset<Row> getResidueMappings(List<String> ids) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();

    boolean withChainId = ids.size() > 0 && ids.get(0).length() > 4;

    // create dataset of ids
    Dataset<Row> df = spark.createDataset(ids, Encoders.STRING()).toDF("id");
    // get cached mappings
    Dataset<Row> mapping = getCachedResidueMappings();

    // dataset for non-cached mappings
    Dataset<Row> notCached = null;
    // dataset with PDB Ids to be downloaded
    Dataset<Row> toDownload = null;

    if (withChainId) {
        // get subset of requested ids from cached dataset
        mapping = mapping.join(df, mapping.col("structureChainId").equalTo(df.col("id"))).drop("id");
        // get ids that are not in the cached dataset
        notCached = df.join(mapping, df.col("id").equalTo(mapping.col("structureChainId")), "left_anti").cache();
        // create dataset of PDB Ids to be downloaded
        toDownload = notCached.withColumn("id", col("id").substr(0, 4)).distinct().cache();
    } else {
        // get subset of requested ids from cached dataset
        mapping = mapping.withColumn("pdbId", col("structureChainId").substr(0, 4));
        mapping = mapping.join(df, mapping.col("pdbId").equalTo(df.col("id"))).drop("id");
        // create dataset of PDB Ids to be downloaded
        toDownload = df.join(mapping, df.col("id").equalTo(mapping.col("pdbId")), "left_anti").distinct().cache();
        mapping = mapping.drop("pdbId");
    }

    toDownload = toDownload.distinct().cache();

    // download data that are not in the cache
    if (toDownload.count() > 0) {
        Dataset<Row> unpData = getChainMappings().select("structureId").distinct();
        toDownload = toDownload.join(unpData, toDownload.col("id").equalTo(unpData.col("structureId"))).drop("structureId").cache();
        System.out.println("Downloading mapping for " + toDownload.count() + " PDB structures.");
        Dataset<Row> downloadedData = downloadData(toDownload);

        // since data are downloaded for all chains in structure, make sure to only include the requested chains.
        if (withChainId) {
            downloadedData = downloadedData.join(notCached,
                downloadedData.col("structureChainId").equalTo(notCached.col("id"))).drop("id");
        }

        mapping = mapping.union(downloadedData);
    }

    return mapping;
}
Example 17
Source File: AtpInteractionAnalysis.java From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
        .groupBy("residue2")
        .count();

    topGroups
        .sort(col("count").desc()) // sort descending by count
        .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
        .groupBy("residue2","atom2")
        .count();

    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
        .sort(col("frequency").desc()) // sort descending
        .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 18
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
        .groupBy("residue2")
        .count();

    topGroups
        .sort(col("count").desc()) // sort descending by count
        .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("residue2","atom2")
        .count();

    topGroupsAndAtoms
        .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
        .filter("frequency > 0.01") // filter out occurrences < 1 %
        .sort(col("frequency").desc()) // sort descending
        .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
        .filter("element2 != 'C'") // exclude carbon interactions
        .groupBy("element2")
        .count();

    topElements.withColumn("frequency", col("count").divide(n))
        .filter("frequency > 0.01") // filter out occurrences < 1 %
        .sort(col("frequency").desc()) // sort descending
        .show(10);

    interactions
        .groupBy("element2")
        .avg("distance")
        .sort("avg(distance)")
        .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
        .groupBy("element2")
        .agg(count("distance"),avg("distance"),min("distance"),max("distance"),kurtosis("distance"))
        .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 19
Source File: DatasetFileConverter.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String inputFile = cmd.getOptionValue("input-file");
    int partitions = Integer.parseInt(cmd.getOptionValue("partitions", "0"));
    String fileFormat = cmd.getOptionValue("file-format", "");
    String compressionCodec = cmd.getOptionValue("compression-codec", "");

    SparkSession spark = SparkSession.builder().master("local[*]")
        .appName(DatasetFileConverter.class.getSimpleName())
        .getOrCreate();

    spark.conf().set("spark.sql.orc.impl", "native");

    // encode options in file name
    String outputFile = getFileExtension(inputFile);
    if (partitions > 1) {
        outputFile += "." + partitions;
    }
    outputFile += "." + fileFormat;
    if (!compressionCodec.isEmpty()) {
        outputFile += "." + compressionCodec;
    }

    System.out.println("Input file : " + inputFile);
    System.out.println("Output file: " + outputFile);

    long t1 = System.nanoTime();

    Dataset<Row> dataset = null;

    // read dataset
    if (inputFile.contains("orc")) {
        dataset = spark.read().format("orc").load(inputFile);
    } else if (inputFile.contains("csv")) {
        dataset = spark.read().format("csv").load(inputFile);
    } else {
        dataset = spark.read().format("parquet").load(inputFile);
    }

    long records = dataset.count();

    // write reformatted dataset
    saveDataset(dataset, partitions, fileFormat, compressionCodec, outputFile);

    long t2 = System.nanoTime();
    System.out.println(records + " records reformatted in " + (t2-t1)/1E9 + " sec.");

    spark.stop();
}
Example 20
Source File: CreatePdbToUniProtMappingFile.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException {

    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputFile = cmd.getOptionValue("output-file");
    boolean build = cmd.hasOption("build");
    boolean update = cmd.hasOption("update");
    // these default options for fileFormat and compressionCodec
    // provide the best compression
    String fileFormat = cmd.getOptionValue("file-format", "orc");
    String compressionCodec = cmd.getOptionValue("compression-codec", "lzo");

    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName(CreatePdbToUniProtMappingFile.class.getSimpleName())
        .getOrCreate();

    String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());

    long t1 = System.nanoTime();

    String dirName = outputFile + "_" + timeStamp + "_tmp";
    String fileName = outputFile + "_" + timeStamp + "." + fileFormat + "." + compressionCodec;

    if (build) {
        // create a new mapping file from scratch
        PdbToUniProt.buildDataset(dirName, "orc", "lzo");
    } else if (update) {
        // create an updated mapping file from the cached version
        PdbToUniProt.updateDataset(dirName, "orc", "lzo");
    }

    long t2 = System.nanoTime();
    System.out.println("Time to build/update dataset: " + (t2-t1)/1E9 + " sec.");

    // By default, spark creates a directory of files.
    // For convenience, coalesce the data into a single file.
    Dataset<Row> ds = spark.read().orc(dirName);
    long count = ds.count();

    int partitions = 1;
    DatasetFileConverter.saveDataset(ds, partitions, fileFormat, compressionCodec, fileName);
    FileUtils.deleteDirectory(new File(dirName));

    System.out.println(count + " records saved to: " + fileName);

    long t3 = System.nanoTime();
    System.out.println("Time to reformat data: " + (t3-t2)/1E9 + " sec.");

    spark.stop();
}