org.apache.spark.sql.types.StructField Java Examples
The following examples show how to use org.apache.spark.sql.types.StructField. They are drawn from a range of open-source projects; the project, source file, and license are noted with each example.
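Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class and column names are illustrative) that builds a schema from StructField instances and uses it to create a small DataFrame:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class StructFieldQuickStart {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("StructFieldQuickStart")
            .master("local[*]") // local master for this sketch only
            .getOrCreate();

        // A StructField is defined by (name, dataType, nullable, metadata).
        StructType schema = new StructType(new StructField[] {
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("name", DataTypes.StringType, true, Metadata.empty())
        });

        List<Row> rows = Arrays.asList(
            RowFactory.create(1, "alice"),
            RowFactory.create(2, null)); // null is allowed: "name" is nullable

        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema();
        spark.stop();
    }
}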
Example #1
Source File: MLContextUtil.java From systemds with Apache License 2.0

/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
    StructType schema = df.schema();
    StructField[] fields = schema.fields();
    if (fields == null) {
        return true;
    }
    for (StructField field : fields) {
        DataType dataType = field.dataType();
        if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
                && (dataType != DataTypes.LongType)
                && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
                && (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
            // uncomment if we support arrays of doubles for matrices
            // if (dataType instanceof ArrayType) {
            //     ArrayType arrayType = (ArrayType) dataType;
            //     if (arrayType.elementType() == DataTypes.DoubleType) {
            //         continue;
            //     }
            // }
            return false;
        }
    }
    return true;
}
Example #2
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLDoublesWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, doubles with ID column, no format specified");

    List<String> list = new ArrayList<>();
    list.add("1,2,2,2");
    list.add("2,3,3,3");
    list.add("3,4,4,4");
    JavaRDD<String> javaRddString = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 27.0");
    ml.execute(script);
}
Example #3
Source File: SchemaConverter.java From geowave with Apache License 2.0

public static SimpleFeatureType schemaToFeatureType(final StructType schema, final String typeName) {
    final SimpleFeatureTypeBuilder typeBuilder = new SimpleFeatureTypeBuilder();
    typeBuilder.setName(typeName);
    typeBuilder.setNamespaceURI(BasicFeatureTypes.DEFAULT_NAMESPACE);
    try {
        typeBuilder.setCRS(CRS.decode("EPSG:4326", true));
    } catch (final FactoryException e) {
        LOGGER.error(e.getMessage(), e);
    }

    final AttributeTypeBuilder attrBuilder = new AttributeTypeBuilder();

    for (final StructField field : schema.fields()) {
        final AttributeDescriptor attrDesc = attrDescFromStructField(attrBuilder, field);
        typeBuilder.add(attrDesc);
    }

    return typeBuilder.buildFeatureType();
}
Example #4
Source File: UnaryTransformer.java From ambiverse-nlu with Apache License 2.0

@Override
public StructType transformSchema(StructType structType) {
    String inputCol = getInputCol();
    String outputCol = getOutputCol();
    DataType inputType = structType.apply(inputCol).dataType();
    this.validateInputType(inputType);
    List<String> names = Arrays.asList(structType.fieldNames());
    Cond.require(!names.contains(outputCol), "The output column " + outputCol + " already exists in this schema!");
    List<StructField> fields = new ArrayList<>();
    for (int i = 0; i < structType.fields().length; i++) {
        fields.add(structType.fields()[i]);
    }
    DataType dt = getOutputDataType();
    fields.add(DataTypes.createStructField(outputCol, dt, isOutputDataTypeNullable()));
    return DataTypes.createStructType(fields);
}
Example #5
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #6
Source File: TestChainedTransformer.java From hudi with Apache License 2.0

@Test
public void testChainedTransformation() {
    StructType schema = DataTypes.createStructType(
        new StructField[] {createStructField("foo", StringType, false)});
    Row r1 = RowFactory.create("100");
    Row r2 = RowFactory.create("200");
    Dataset<Row> original = sparkSession.sqlContext().createDataFrame(Arrays.asList(r1, r2), schema);

    Transformer t1 = (jsc, sparkSession, dataset, properties) -> dataset.withColumnRenamed("foo", "bar");
    Transformer t2 = (jsc, sparkSession, dataset, properties) -> dataset.withColumn("bar", dataset.col("bar").cast(IntegerType));

    ChainedTransformer transformer = new ChainedTransformer(Arrays.asList(t1, t2));
    Dataset<Row> transformed = transformer.apply(jsc, sparkSession, original, null);

    assertEquals(2, transformed.count());
    assertArrayEquals(new String[] {"bar"}, transformed.columns());

    List<Row> rows = transformed.collectAsList();
    assertEquals(100, rows.get(0).getInt(0));
    assertEquals(200, rows.get(1).getInt(0));
}
Example #7
Source File: TestRowUtils.java From envelope with Apache License 2.0

@Test
public void testRemoveOneField() {
    StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
    StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
    StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
    StructType removeSchema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));
    Row remove = new RowWithSchema(removeSchema, "hello", 1, 1.0);

    Row removed = RowUtils.remove(remove, "field2");
    Row expected = new RowWithSchema(
        DataTypes.createStructType(Lists.newArrayList(field1, field3)), "hello", 1.0);

    assertEquals(expected, removed);
}
Example #8
Source File: IndexRUtil.java From indexr with Apache License 2.0

public static SegmentSchema sparkSchemaToIndexRSchema(List<StructField> sparkSchema, IsIndexed isIndexed) {
    List<ColumnSchema> columns = new ArrayList<>();
    for (StructField f : sparkSchema) {
        SQLType type;
        if (f.dataType() instanceof IntegerType) {
            type = SQLType.INT;
        } else if (f.dataType() instanceof LongType) {
            type = SQLType.BIGINT;
        } else if (f.dataType() instanceof FloatType) {
            type = SQLType.FLOAT;
        } else if (f.dataType() instanceof DoubleType) {
            type = SQLType.DOUBLE;
        } else if (f.dataType() instanceof StringType) {
            type = SQLType.VARCHAR;
        } else if (f.dataType() instanceof DateType) {
            type = SQLType.DATE;
        } else if (f.dataType() instanceof TimestampType) {
            type = SQLType.DATETIME;
        } else {
            throw new IllegalStateException("Unsupported type: " + f.dataType());
        }
        columns.add(new ColumnSchema(f.name(), type, isIndexed.apply(f.name())));
    }
    return new SegmentSchema(columns);
}
Example #9
Source File: TestDecisionStep.java From envelope with Apache License 2.0

@Test
public void testPruneByStepValueFalse() {
    StructType schema = new StructType(new StructField[] {
        new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
    });
    List<Row> rows = Lists.newArrayList(
        RowFactory.create(false)
    );
    Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
    step1.setData(ds);

    Map<String, Object> step2ConfigMap = Maps.newHashMap();
    step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
    step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
    step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
    step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
    Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
    RefactorStep step2 = new DecisionStep("step2");
    step2.configure(step2Config);
    steps.add(step2);

    Set<Step> refactored = step2.refactor(steps);

    assertEquals(refactored, Sets.newHashSet(step1, step2, step5, step6));
}
Example #10
Source File: SimpleFeatureMapper.java From geowave with Apache License 2.0

@Override
public Row call(final SimpleFeature feature) throws Exception {
    final Object[] fields = new Serializable[schema.size()];

    for (int i = 0; i < schema.size(); i++) {
        final Object fieldObj = feature.getAttribute(i);
        if (fieldObj != null) {
            final StructField structField = schema.apply(i);
            if (structField.name().equals("geom")) {
                fields[i] = fieldObj;
            } else if (structField.dataType() == DataTypes.TimestampType) {
                fields[i] = new Timestamp(((Date) fieldObj).getTime());
            } else if (structField.dataType() != null) {
                fields[i] = fieldObj;
            } else {
                LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
            }
        }
    }

    return new GenericRowWithSchema(fields, schema);
}
Example #11
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #12
Source File: RDDConverterUtilsExtTest.java From systemds with Apache License 2.0

@Test
public void testStringDataFrameToVectorDataFrameNull() {
    List<String> list = new ArrayList<>();
    list.add("[1.2, 3.4]");
    list.add(null);
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
    SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
    Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);

    List<String> expectedResults = new ArrayList<>();
    expectedResults.add("[[1.2,3.4]]");
    expectedResults.add("[null]");

    List<Row> outputList = outDF.collectAsList();
    for (Row row : outputList) {
        assertTrue("Expected results don't contain: " + row, expectedResults.contains(row.toString()));
    }
}
Example #13
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testGetTuple1DML() {
    System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
    JavaRDD<String> javaRddString = sc
        .parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("N=M*2").in("M", df).out("N");
    Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
    double[][] n = tuple._1().to2DDoubleArray();
    Assert.assertEquals(2.0, n[0][0], 0);
    Assert.assertEquals(4.0, n[0][1], 0);
    Assert.assertEquals(6.0, n[0][2], 0);
    Assert.assertEquals(8.0, n[1][0], 0);
    Assert.assertEquals(10.0, n[1][1], 0);
    Assert.assertEquals(12.0, n[1][2], 0);
    Assert.assertEquals(14.0, n[2][0], 0);
    Assert.assertEquals(16.0, n[2][1], 0);
    Assert.assertEquals(18.0, n[2][2], 0);
}
Example #14
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #15
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column, no format specified");

    List<String> list = new ArrayList<>();
    list.add("2,2,2");
    list.add("3,3,3");
    list.add("4,4,4");
    JavaRDD<String> javaRddString = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 27.0");
    ml.execute(script);
}
Example #16
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testInputMatrixBlockDML() {
    System.out.println("MLContextTest - input MatrixBlock DML");

    List<String> list = new ArrayList<>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Matrix m = new Matrix(dataFrame);
    MatrixBlock matrixBlock = m.toMatrixBlock();
    Script script = dml("avg = avg(M);").in("M", matrixBlock).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
Example #17
Source File: TestRangeRowRule.java From envelope with Apache License 2.0

@Test
public void testDontIgnoreNulls() {
    StructType schema = new StructType(new StructField[] {
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
    });

    Map<String, Object> configMap = new HashMap<>();
    configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
    configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
    configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0, 105));
    Config config = ConfigFactory.parseMap(configMap);

    RangeRowRule rule = new RangeRowRule();
    assertNoValidationFailures(rule, config);
    rule.configure(config);
    rule.configureName("agerange");

    Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
    assertFalse("Row should not pass rule", rule.check(row1));
}
Example #18
Source File: QuaternaryStructureDataset.java From mmtf-spark with Apache License 2.0

/**
 * Returns a dataset with quaternary structure info.
 *
 * @param structure
 * @return dataset with quaternary structure info
 */
public static Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structure) {
    JavaRDD<Row> rows = structure.flatMap(t -> getQuaternaryStructure(t));

    StructType schema = new StructType(new StructField[]{
        new StructField("structureId", DataTypes.StringType, false, Metadata.empty()),
        new StructField("bioAssemblyId", DataTypes.StringType, false, Metadata.empty()),
        new StructField("proteinStoichiometry", DataTypes.StringType, true, Metadata.empty()),
        new StructField("dnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
        new StructField("rnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
    });

    SparkSession spark = SparkSession.builder().getOrCreate();
    return spark.createDataFrame(rows, schema);
}
Example #19
Source File: ParquetWithSparkSchemaVisitor.java From iceberg with Apache License 2.0

private static <T> T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor<T> visitor) {
    visitor.fieldNames.push(field.getName());
    try {
        return visit(sField.dataType(), field, visitor);
    } finally {
        visitor.fieldNames.pop();
    }
}
Example #20
Source File: SchemaConverter.java From toolbox with Apache License 2.0

static StructType getSchema(Attributes atts) {
    // Generate the schema based on the list of attributes and depending on their type:
    List<StructField> fields = new ArrayList<StructField>();
    for (Attribute att : atts.getFullListOfAttributes()) {
        if (att.getStateSpaceType().getStateSpaceTypeEnum() == REAL)
            fields.add(DataTypes.createStructField(att.getName(), DataTypes.DoubleType, true));
        else
            fields.add(DataTypes.createStructField(att.getName(), DataTypes.StringType, true));
    }
    return DataTypes.createStructType(fields);
}
Example #21
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0

private StructType getExpectedSchemaOfAB(boolean isPredicateStored, boolean isPredicateStoredAsURI) {
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("a" + ID_SUFFIX_JOINER + ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField("b" + ID_SUFFIX_JOINER + ID_COLUMN_NAME, DataTypes.LongType, false));
    if (isPredicateStored) {
        fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME,
            isPredicateStoredAsURI ? DataTypes.StringType : DataTypes.IntegerType, false));
    }
    return DataTypes.createStructType(fields);
}
Example #22
Source File: InteractionCenter.java From mmtf-spark with Apache License 2.0

/**
 * Returns a schema to create Spark Datasets. This schema must match the
 * order in which the data are returned by the {@code getAsObject()} method.
 *
 * @param index
 *            an integer index to label an interaction center
 * @return schema to represent an interaction center in a Spark Dataset
 */
public static StructField[] getStructFields(int index) {
    boolean nullable = true;

    return new StructField[] {
        DataTypes.createStructField("atom" + index, DataTypes.StringType, nullable),
        DataTypes.createStructField("element" + index, DataTypes.StringType, nullable),
        DataTypes.createStructField("group" + index, DataTypes.StringType, nullable),
        DataTypes.createStructField("groupNum" + index, DataTypes.StringType, nullable),
        DataTypes.createStructField("type" + index, DataTypes.StringType, nullable),
        DataTypes.createStructField("chain" + index, DataTypes.StringType, nullable),
        DataTypes.createStructField("nbFactor" + index, DataTypes.FloatType, nullable)};
}
Example #23
Source File: JavaRFormulaExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaRFormulaExample")
        .getOrCreate();

    // $example on$
    StructType schema = createStructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("country", StringType, false),
        createStructField("hour", IntegerType, false),
        createStructField("clicked", DoubleType, false)
    });
    List<Row> data = Arrays.asList(
        RowFactory.create(7, "US", 18, 1.0),
        RowFactory.create(8, "CA", 12, 0.0),
        RowFactory.create(9, "NZ", 15, 0.0)
    );
    Dataset<Row> dataset = spark.createDataFrame(data, schema);

    RFormula formula = new RFormula()
        .setFormula("clicked ~ country + hour")
        .setFeaturesCol("features")
        .setLabelCol("label");

    Dataset<Row> output = formula.fit(dataset).transform(dataset);
    output.select("features", "label").show();
    // $example off$

    spark.stop();
}
Example #24
Source File: TestRangeRowRule.java From envelope with Apache License 2.0

@Test
public void testAgeRangeInt() {
    StructType schema = new StructType(new StructField[] {
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
    });

    Map<String, Object> configMap = new HashMap<>();
    configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
    configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
    configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0, 105));
    Config config = ConfigFactory.parseMap(configMap);

    RangeRowRule rule = new RangeRowRule();
    assertNoValidationFailures(rule, config);
    rule.configure(config);
    rule.configureName("agerange");

    Row row1 = new RowWithSchema(schema, "Ian", "Ian", 34, new BigDecimal("0.00"));
    assertTrue("Row should pass rule", rule.check(row1));

    Row row2 = new RowWithSchema(schema, "Webster1", "Websta1", 110, new BigDecimal("450.10"));
    assertFalse("Row should not pass rule", rule.check(row2));

    Row row3 = new RowWithSchema(schema, "", "Ian1", 106, new BigDecimal("450.10"));
    assertFalse("Row should not pass rule", rule.check(row3));

    Row row4 = new RowWithSchema(schema, "First Last", "Ian Last", 105, new BigDecimal("450.10"));
    assertTrue("Row should pass rule", rule.check(row4));
}
Example #25
Source File: JavaBucketizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaBucketizerExample")
        .getOrCreate();

    // $example on$
    double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

    List<Row> data = Arrays.asList(
        RowFactory.create(-999.9),
        RowFactory.create(-0.5),
        RowFactory.create(-0.3),
        RowFactory.create(0.0),
        RowFactory.create(0.2),
        RowFactory.create(999.9)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer()
        .setInputCol("features")
        .setOutputCol("bucketedFeatures")
        .setSplits(splits);

    // Transform original data into its bucket index.
    Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

    System.out.println("Bucketizer output with " + (bucketizer.getSplits().length - 1) + " buckets");
    bucketedData.show();
    // $example off$

    spark.stop();
}
Example #26
Source File: SparkTypeVisitor.java From iceberg with Apache License 2.0

static <T> T visit(DataType type, SparkTypeVisitor<T> visitor) {
    if (type instanceof StructType) {
        StructField[] fields = ((StructType) type).fields();
        List<T> fieldResults = Lists.newArrayListWithExpectedSize(fields.length);

        for (StructField field : fields) {
            fieldResults.add(visitor.field(
                field,
                visit(field.dataType(), visitor)));
        }

        return visitor.struct((StructType) type, fieldResults);

    } else if (type instanceof MapType) {
        return visitor.map((MapType) type,
            visit(((MapType) type).keyType(), visitor),
            visit(((MapType) type).valueType(), visitor));

    } else if (type instanceof ArrayType) {
        return visitor.array(
            (ArrayType) type,
            visit(((ArrayType) type).elementType(), visitor));

    } else if (type instanceof UserDefinedType) {
        throw new UnsupportedOperationException(
            "User-defined types are not supported");

    } else {
        return visitor.atomic(type);
    }
}
Example #27
Source File: RDDConverterUtilsExtTest.java From systemds with Apache License 2.0

@Test(expected = SparkException.class)
public void testStringDataFrameToVectorDataFrameNonNumbers() {
    List<String> list = new ArrayList<>();
    list.add("[cheeseburger,fries]");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
    SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
    Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);

    // trigger evaluation to throw exception
    outDF.collectAsList();
}