org.apache.spark.sql.RowFactory Java Examples
The following examples show how to use org.apache.spark.sql.RowFactory.
Each example is taken from an open source project; the source file, project, and license are noted above it.
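Before the project-specific examples, the following minimal sketch shows the core pattern they all share: RowFactory.create builds an untyped Row from a varargs list of values, and the rows are then paired with a StructType schema via SparkSession.createDataFrame. This is an illustrative, self-contained sketch assuming the Spark SQL Java API on the classpath; the class name, the "name"/"age" columns, and the local[*] master are made up for the example and do not come from the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryMinimalExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("RowFactoryMinimalExample")
                .master("local[*]")   // illustrative; normally set by spark-submit
                .getOrCreate();

        // Each RowFactory.create call produces one untyped Row; the value order
        // must match the schema defined below.
        List<Row> rows = Arrays.asList(
                RowFactory.create("alice", 34),
                RowFactory.create("bob", 28));

        StructType schema = new StructType(new StructField[]{
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("age", DataTypes.IntegerType, false)});

        // Pair the rows with the schema to obtain a Dataset<Row>.
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.show();

        spark.stop();
    }
}

Because a Row created this way carries no type information of its own, the schema passed to createDataFrame is what gives each position a name and data type; mismatched ordering or types surface only at runtime.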
Example #1
Source File: DatasetBalancerTest.java From mmtf-spark with Apache License 2.0

@Test
public void test() {
    List<Row> rows = Arrays.asList(
            RowFactory.create("a", 1), RowFactory.create("a", 2),
            RowFactory.create("b", 1), RowFactory.create("b", 2), RowFactory.create("b", 3),
            RowFactory.create("c", 1), RowFactory.create("c", 2), RowFactory.create("c", 3), RowFactory.create("c", 4));

    SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();

    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("key", DataTypes.StringType, false),
            DataTypes.createStructField("value", DataTypes.IntegerType, false) });

    Dataset<Row> data = spark.createDataFrame(rows, schema);

    long seed = 19;
    Dataset<Row> balancedData = DatasetBalancer.downsample(data, "key", seed);
    assertTrue(balancedData.count() > 0);

    spark.close();
}
Example #2
Source File: StructureToSecondaryStructureSegments.java From mmtf-spark with Apache License 2.0

@Override
public Iterator<Row> call(Row t) throws Exception {
    // get information from the input Row
    String structureChainId = t.getString(0);
    String sequence = t.getString(1);
    String dsspQ8 = t.getString(5);
    String dsspQ3 = t.getString(6);

    int numSegments = Math.max(0, sequence.length() - length);
    List<Row> sequences = new ArrayList<>(numSegments);

    for (int i = 0; i < sequence.length() - length; i++) {
        String currSeq = sequence.substring(i, i + length);
        String labelQ8 = dsspQ8.substring(i + length/2, i + length/2 + 1);
        String labelQ3 = dsspQ3.substring(i + length/2, i + length/2 + 1);

        if (!labelQ8.equals("X") && !labelQ3.equals("X")) {
            sequences.add(RowFactory.create(structureChainId, currSeq, labelQ8, labelQ3));
        }
    }

    return sequences.iterator();
}
Example #3
Source File: JavaBinarizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaBinarizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, 0.1),
        RowFactory.create(1, 0.8),
        RowFactory.create(2, 0.2)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

    Binarizer binarizer = new Binarizer()
        .setInputCol("feature")
        .setOutputCol("binarized_feature")
        .setThreshold(0.5);

    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
    binarizedDataFrame.show();
    // $example off$

    spark.stop();
}
Example #4
Source File: JavaSQLTransformerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaSQLTransformerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, 1.0, 3.0),
        RowFactory.create(2, 2.0, 5.0)
    );
    StructType schema = new StructType(new StructField[] {
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    SQLTransformer sqlTrans = new SQLTransformer().setStatement(
        "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

    sqlTrans.transform(df).show();
    // $example off$

    spark.stop();
}
Example #5
Source File: AtomInteraction.java From mmtf-spark with Apache License 2.0

/**
 * Returns rows of pairwise interactions with the central atom.
 *
 * @return rows of pairwise interactions with the central atom
 */
public List<Row> getPairInteractionsAsRows() {
    List<Row> rows = new ArrayList<>(neighbors.size());

    int length = InteractionCenter.getLength();

    calcCoordinationGeometry(neighbors.size());

    // copy data of the interacting atoms
    for (int i = 0; i < neighbors.size(); i++) {
        Object[] data = new Object[2 * length + 2];
        int index = 0;
        data[index++] = structureId;

        System.arraycopy(center.getAsObject(), 0, data, index, length);
        index += length;
        System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
        index += length;
        data[index++] = distances[i];

        rows.add(RowFactory.create(data));
    }

    return rows;
}
Example #6
Source File: RelationExtractor.java From rdf2x with Apache License 2.0

/**
 * Map a {@link Instance} into an Iterable of all of its relations
 * represented as rows of (related URI, predicate index, type index, instance ID)
 *
 * @param instance the requested {@link Instance}
 * @return an Iterable of all of its relations represented as rows of (related URI, predicate index, type index, instance ID)
 */
private Iterable<Row> getRelatedTypeIDs(Instance instance) {
    // typeIDs representing references to the instance in each table (or a single one, if instance has a single type)
    final Long id = instance.getId();
    final List<Tuple2<Integer, Long>> instanceTypeIDs = getRelationEntityTypes(instance)
            .map(typeIndex -> new Tuple2<>(typeIndex, id))
            .collect(Collectors.toList());

    return instance.getRelations().stream()
            .flatMap(relation -> instanceTypeIDs.stream()
                    .map(instanceTypeID -> RowFactory.create(
                            relation.getObjectURI(),
                            relation.getPredicateIndex(),
                            instanceTypeID._1(),
                            instanceTypeID._2()
                    ))
            ).collect(Collectors.toList());
}
Example #7
Source File: JavaStopWordsRemoverExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaStopWordsRemoverExample")
        .getOrCreate();

    // $example on$
    StopWordsRemover remover = new StopWordsRemover()
        .setInputCol("raw")
        .setOutputCol("filtered");

    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
        RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
    );

    StructType schema = new StructType(new StructField[]{
        new StructField(
            "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
    });

    Dataset<Row> dataset = spark.createDataFrame(data, schema);
    remover.transform(dataset).show(false);
    // $example off$

    spark.stop();
}
Example #8
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0

@Override
public Iterator<Row> call(Tuple2<Long, FrameBlock> arg0) throws Exception {
    long rowIndex = arg0._1();
    FrameBlock blk = arg0._2();
    ArrayList<Row> ret = new ArrayList<>();

    // handle Frame block data
    int rows = blk.getNumRows();
    int cols = blk.getNumColumns();
    for (int i = 0; i < rows; i++) {
        Object[] row = new Object[cols + 1];
        row[0] = (double) rowIndex++;
        for (int j = 0; j < cols; j++)
            row[j + 1] = blk.get(i, j);
        ret.add(RowFactory.create(row));
    }

    return ret.iterator();
}
Example #9
Source File: MetadataWriter.java From rdf2x with Apache License 2.0

/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();

    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example #10
Source File: JavaElementwiseProductExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaElementwiseProductExample")
        .getOrCreate();

    // $example on$
    // Create some vector data; also works for sparse vectors
    List<Row> data = Arrays.asList(
        RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
        RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
    );

    List<StructField> fields = new ArrayList<StructField>(2);
    fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));

    StructType schema = DataTypes.createStructType(fields);

    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

    ElementwiseProduct transformer = new ElementwiseProduct()
        .setScalingVec(transformingVector)
        .setInputCol("vector")
        .setOutputCol("transformedVector");

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show();
    // $example off$

    spark.stop();
}
Example #11
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0

@Test
public void testWriteRelationTablesWithPredicateIndex() throws IOException {
    InstanceRelationWriter writer = new InstanceRelationWriter(config
            .setStorePredicate(true), jsc(), persistor, rdfSchema);
    writer.writeRelationTables(getTestRelationSchema(), getTestRelations());

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L, uriIndex.getIndex("http://example.com/knows")));
    rows.add(RowFactory.create(2L, 3L, uriIndex.getIndex("http://example.com/likes")));

    DataFrame result = this.result.values().iterator().next();
    assertEquals("Expected schema of A_B was extracted", getExpectedSchemaOfAB(true, false), result.schema());
    assertRDDEquals("Expected rows of A_B were extracted", jsc().parallelize(rows), result.toJavaRDD());
}
Example #12
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0

@Test
public void testWriteRelationTablesWithoutPredicateIndex() throws IOException {
    InstanceRelationWriter writer = new InstanceRelationWriter(config
            .setStorePredicate(false), jsc(), persistor, rdfSchema);
    writer.writeRelationTables(getTestRelationSchema(), getTestRelations());

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L));
    rows.add(RowFactory.create(2L, 3L));

    DataFrame result = this.result.values().iterator().next();
    assertEquals("Expected schema of A_B was extracted", getExpectedSchemaOfAB(false, false), result.schema());
    assertRDDEquals("Expected rows of A_B were extracted", jsc().parallelize(rows), result.toJavaRDD());
}
Example #13
Source File: RelationExtractorTest.java From rdf2x with Apache License 2.0

/**
 * Test if expected directed relations are collected from an RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());

    RelationExtractor collector = new RelationExtractor(
            new RelationConfig(),
            jsc(),
            new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();
    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));
    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));
    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
Example #14
Source File: InstanceRelationWriter.java From rdf2x with Apache License 2.0

private static Row getAttributeRow(Instance instance, Predicate predicate, Object value) {
    return RowFactory.create(
            instance.getId(),
            predicate.getPredicateIndex(),
            LiteralType.toString(predicate.getLiteralType()),
            predicate.getLanguage(),
            value.toString()
    );
}
Example #15
Source File: JavaBucketizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaBucketizerExample")
        .getOrCreate();

    // $example on$
    double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

    List<Row> data = Arrays.asList(
        RowFactory.create(-999.9),
        RowFactory.create(-0.5),
        RowFactory.create(-0.3),
        RowFactory.create(0.0),
        RowFactory.create(0.2),
        RowFactory.create(999.9)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer()
        .setInputCol("features")
        .setOutputCol("bucketedFeatures")
        .setSplits(splits);

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

    System.out.println("Bucketizer output with " + (bucketizer.getSplits().length - 1) + " buckets");
    bucketedData.show();
    // $example off$

    spark.stop();
}
Example #16
Source File: MetadataWriter.java From rdf2x with Apache License 2.0

/**
 * Write metadata describing relation tables
 *
 * @param relationSchema the relation schema
 */
public void writeRelationMetadata(RelationSchema relationSchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RELATIONS_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(RELATIONS_FROM_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_TO_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_PREDICATE_ID, DataTypes.IntegerType, true));

    // create table rows
    List<Row> rows = relationSchema.getTables().stream()
            .map(table -> {
                RelationPredicateFilter predicateFilter = table.getPredicateFilter();
                RelationEntityFilter entityFilter = table.getEntityFilter();
                Object[] valueArray = new Object[]{
                        table.getName(),
                        entityFilter == null ? null : entityFilter.getFromTypeName(),
                        entityFilter == null ? null : entityFilter.getToTypeName(),
                        predicateFilter == null ? null : rdfSchema.getPredicateIndex().getIndex(predicateFilter.getPredicateURI())
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    StructType schema = DataTypes.createStructType(fields);

    // add index for each field
    List<Tuple2<String, String>> indexes = fields.stream()
            .map(field -> new Tuple2<>(RELATIONS_TABLE_NAME, field.name()))
            .collect(Collectors.toList());

    // create and write the META_Relations dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(RELATIONS_TABLE_NAME, df);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example #17
Source File: MetadataWriter.java From rdf2x with Apache License 2.0

/**
 * Persist predicate metadata table storing all predicates.
 */
public void writePredicateMetadata() {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(PREDICATE_ID, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(PREDICATE_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(PREDICATE_LABEL, DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_ID));

    final IndexMap<String> predicateIndex = rdfSchema.getPredicateIndex();
    final Map<String, String> uriLabels = rdfSchema.getUriLabels();

    // create table rows
    List<Row> rows = predicateIndex.getValues().stream()
            .map(uri -> {
                Object[] valueArray = new Object[]{
                        predicateIndex.getIndex(uri),
                        uri,
                        uriLabels.get(uri)
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Predicates dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(PREDICATES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example #18
Source File: RelationExtractor.java From rdf2x with Apache License 2.0

/**
 * Map each type of an {@link Instance} to a row of (instance URI, instance type index, instance ID)
 *
 * @param instance the requested {@link Instance}
 * @return rows of (instance URI, instance type index, instance ID), one per type
 */
private Iterable<Row> getInstanceTypeIDs(Instance instance) {
    String instanceURI = instance.getUri();
    Long instanceID = instance.getId();
    return getRelationEntityTypes(instance)
            .map(typeIndex -> RowFactory.create(instanceURI, typeIndex, instanceID))
            .collect(Collectors.toList());
}
Example #19
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0

private JavaRDD<Row> getExpectedRowsOfSingleRelationTable() {
    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L, uriIndex.getIndex("http://example.com/knows")));
    rows.add(RowFactory.create(2L, 3L, uriIndex.getIndex("http://example.com/likes")));
    return jsc().parallelize(rows);
}
Example #20
Source File: JavaWord2VecExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaWord2VecExample")
        .getOrCreate();

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
        RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
        RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    Dataset<Row> documentDF = spark.createDataFrame(data, schema);

    // Learn a mapping from words to Vectors.
    Word2Vec word2Vec = new Word2Vec()
        .setInputCol("text")
        .setOutputCol("result")
        .setVectorSize(3)
        .setMinCount(0);

    Word2VecModel model = word2Vec.fit(documentDF);
    Dataset<Row> result = model.transform(documentDF);

    for (Row row : result.collectAsList()) {
        List<String> text = row.getList(0);
        Vector vector = (Vector) row.get(1);
        System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
    }
    // $example off$

    spark.stop();
}
Example #21
Source File: JavaQuantileDiscretizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaQuantileDiscretizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, 18.0),
        RowFactory.create(1, 19.0),
        RowFactory.create(2, 8.0),
        RowFactory.create(3, 5.0),
        RowFactory.create(4, 2.2)
    );

    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    // $example off$
    // Output of QuantileDiscretizer for such small datasets can depend on the number of
    // partitions. Here we force a single partition to ensure consistent results.
    // Note this is not necessary for normal use cases
    df = df.repartition(1);
    // $example on$
    QuantileDiscretizer discretizer = new QuantileDiscretizer()
        .setInputCol("hour")
        .setOutputCol("result")
        .setNumBuckets(3);

    Dataset<Row> result = discretizer.fit(df).transform(df);
    result.show();
    // $example off$

    spark.stop();
}
Example #22
Source File: JavaMinMaxScalerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaMinMaxScalerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
        RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
        RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    MinMaxScaler scaler = new MinMaxScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures");

    // Compute summary statistics and generate MinMaxScalerModel
    MinMaxScalerModel scalerModel = scaler.fit(dataFrame);

    // rescale each feature to range [min, max].
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
    System.out.println("Features scaled to range: [" + scaler.getMin() + ", " + scaler.getMax() + "]");
    scaledData.select("features", "scaledFeatures").show();
    // $example off$

    spark.stop();
}
Example #23
Source File: JavaTfIdfExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTfIdfExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0.0, "Hi I heard about Spark"),
        RowFactory.create(0.0, "I wish Java could use case classes"),
        RowFactory.create(1.0, "Logistic regression models are neat")
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    Dataset<Row> wordsData = tokenizer.transform(sentenceData);

    int numFeatures = 20;
    HashingTF hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(numFeatures);

    Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    IDFModel idfModel = idf.fit(featurizedData);

    Dataset<Row> rescaledData = idfModel.transform(featurizedData);
    rescaledData.select("label", "features").show();
    // $example off$

    spark.stop();
}
Example #24
Source File: JavaVectorSlicerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaVectorSlicerExample")
        .getOrCreate();

    // $example on$
    Attribute[] attrs = new Attribute[]{
        NumericAttribute.defaultAttr().withName("f1"),
        NumericAttribute.defaultAttr().withName("f2"),
        NumericAttribute.defaultAttr().withName("f3")
    };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);

    List<Row> data = Lists.newArrayList(
        RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
        RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
    );

    Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));

    VectorSlicer vectorSlicer = new VectorSlicer()
        .setInputCol("userFeatures").setOutputCol("features");

    vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
    // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})

    Dataset<Row> output = vectorSlicer.transform(dataset);
    output.show(false);
    // $example off$

    spark.stop();
}
Example #25
Source File: JavaNormalizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaNormalizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
        RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
        RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    // Normalize each Vector using $L^1$ norm.
    Normalizer normalizer = new Normalizer()
        .setInputCol("features")
        .setOutputCol("normFeatures")
        .setP(1.0);

    Dataset<Row> l1NormData = normalizer.transform(dataFrame);
    l1NormData.show();

    // Normalize each Vector using $L^\infty$ norm.
    Dataset<Row> lInfNormData =
        normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
    lInfNormData.show();
    // $example off$

    spark.stop();
}
Example #26
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0

private DataFrame getTestRelations() {
    List<Row> rows = new ArrayList<>();

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/knows"),
            uriIndex.getIndex("http://example.com/a"),
            1L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/likes"),
            uriIndex.getIndex("http://example.com/a"),
            2L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    return sql.createDataFrame(rows, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );
}
Example #27
Source File: JavaMinHashLSHExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaMinHashLSHExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
        RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
        RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("keys", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    MinHashLSH mh = new MinHashLSH()
        .setNumHashTables(1)
        .setInputCol("keys")
        .setOutputCol("values");

    MinHashLSHModel model = mh.fit(dataFrame);
    model.transform(dataFrame).show();
    // $example off$

    spark.stop();
}
Example #28
Source File: JavaVectorAssemblerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaVectorAssemblerExample")
        .getOrCreate();

    // $example on$
    StructType schema = createStructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("hour", IntegerType, false),
        createStructField("mobile", DoubleType, false),
        createStructField("userFeatures", new VectorUDT(), false),
        createStructField("clicked", DoubleType, false)
    });
    Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
    Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
        .setOutputCol("features");

    Dataset<Row> output = assembler.transform(dataset);
    System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
        "'features'");
    output.select("features", "clicked").show(false);
    // $example off$

    spark.stop();
}
Example #29
Source File: JavaRFormulaExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaRFormulaExample")
        .getOrCreate();

    // $example on$
    StructType schema = createStructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("country", StringType, false),
        createStructField("hour", IntegerType, false),
        createStructField("clicked", DoubleType, false)
    });

    List<Row> data = Arrays.asList(
        RowFactory.create(7, "US", 18, 1.0),
        RowFactory.create(8, "CA", 12, 0.0),
        RowFactory.create(9, "NZ", 15, 0.0)
    );

    Dataset<Row> dataset = spark.createDataFrame(data, schema);
    RFormula formula = new RFormula()
        .setFormula("clicked ~ country + hour")
        .setFeaturesCol("features")
        .setLabelCol("label");
    Dataset<Row> output = formula.fit(dataset).transform(dataset);
    output.select("features", "label").show();
    // $example off$

    spark.stop();
}
Example #30
Source File: QuaternaryStructureDataset.java From mmtf-spark with Apache License 2.0

private static Iterator<Row> getQuaternaryStructure(Tuple2<String, StructureDataInterface> t) throws Exception {
    List<Row> rows = new ArrayList<>();
    String key = t._1;
    StructureDataInterface structure = t._2;

    ColumnarStructure cs = new ColumnarStructure(structure, true);
    String[] chainEntityTypes = cs.getChainEntityTypes();
    int[] chainToEntityIndex = cs.getChainToEntityIndices();

    for (int i = 0; i < structure.getNumBioassemblies(); i++) {
        List<Integer> proteinIndices = new ArrayList<>();
        List<Integer> dnaIndices = new ArrayList<>();
        List<Integer> rnaIndices = new ArrayList<>();

        for (int j = 0; j < structure.getNumTransInBioassembly(i); j++) {
            for (int chainIndex : structure.getChainIndexListForTransform(i, j)) {
                int entityIndex = chainToEntityIndex[chainIndex];
                String type = chainEntityTypes[chainIndex];
                if (type.equals("PRO")) {
                    proteinIndices.add(entityIndex);
                } else if (type.equals("DNA")) {
                    dnaIndices.add(entityIndex);
                } else if (type.equals("RNA")) {
                    rnaIndices.add(entityIndex);
                }
            }
        }

        String proStoich = stoichiometry(coefficients(proteinIndices));
        String dnaStoich = stoichiometry(coefficients(dnaIndices));
        String rnaStoich = stoichiometry(coefficients(rnaIndices));
        rows.add(RowFactory.create(key, structure.getBioassemblyName(i), proStoich, dnaStoich, rnaStoich));
    }

    return rows.iterator();
}