Java Code Examples for org.apache.spark.sql.types.DataTypes#createStructType()
The following examples show how to use org.apache.spark.sql.types.DataTypes#createStructType().
Each example is drawn from an open source project; the project, source file, and license are noted above the example code.
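Before the project examples, here is a minimal, self-contained sketch of the API itself: createStructField(name, dataType, nullable) builds one field, and createStructType accepts either a List<StructField> or a StructField[]. The class and field names below are illustrative only, not taken from any of the projects that follow.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class CreateStructTypeSketch {
  public static void main(String[] args) {
    // Each StructField is defined by a name, a data type, and a nullable flag.
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("id", DataTypes.LongType, false));
    fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));

    // createStructType also has an overload that takes a StructField[].
    StructType schema = DataTypes.createStructType(fields);
    System.out.println(schema.treeString());
  }
}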
Example 1
Source File: ProtobufUtils.java from envelope with Apache License 2.0
/**
 * Construct a {@code Dataset} schema from a {@code Descriptor}
 * <p>
 * This iterates and recurses through a {@link com.google.protobuf.Descriptors.Descriptor} and produces a
 * {@link StructType} for {@link org.apache.spark.sql.Dataset<Row>}.
 * Protobuf {@code oneof} fields are flattened into discrete {@link StructField} instances.
 * <p>
 * This will pass the value of {@link Descriptors.FieldDescriptor#isRequired()} to the associated
 * {@link StructField}.
 *
 * @param descriptor the Descriptor to convert
 * @return the converted StructType
 */
public static StructType buildSchema(Descriptors.Descriptor descriptor) {
  List<StructField> members = new ArrayList<>();
  List<Descriptors.FieldDescriptor> protoFields = descriptor.getFields();

  for (Descriptors.FieldDescriptor fieldDescriptor : protoFields) {
    DataType fieldType = convertType(fieldDescriptor);
    StructField structField = DataTypes.createStructField(fieldDescriptor.getName(), fieldType,
        !fieldDescriptor.isRequired());
    members.add(structField);
    LOG.debug("FieldDescriptor[{}] => StructField[{}]", fieldDescriptor.getFullName(), structField);
  }

  if (members.isEmpty()) {
    throw new RuntimeException("No FieldDescriptors found");
  }

  return DataTypes.createStructType(members.toArray(new StructField[0]));
}
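For orientation, a caller would pass the descriptor of a protobuf-generated message class. This hypothetical one-liner (MyMessage stands in for any generated class and is not part of the envelope source) shows the shape of the call:

// Hypothetical caller: MyMessage is a placeholder for a protobuf-generated class.
StructType schema = ProtobufUtils.buildSchema(MyMessage.getDescriptor());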
Example 2
Source File: TranslateFunction.java from envelope with Apache License 2.0
private StructType addFieldNameUnderscores(StructType without) {
  List<StructField> withFields = Lists.newArrayList();

  for (StructField withoutField : without.fields()) {
    String withName = "_" + withoutField.name();
    if (Arrays.asList(without.fieldNames()).contains(withName)) {
      throw new RuntimeException("Can not append raw field '" + withName + "' because that " +
          "field already exists as a result of the translation");
    }
    StructField withField = DataTypes.createStructField(
        withName, withoutField.dataType(), withoutField.nullable(), withoutField.metadata());
    withFields.add(withField);
  }

  return DataTypes.createStructType(withFields);
}
Example 3
Source File: JavaStocks.java from spark-ts-examples with Apache License 2.0
private static DataFrame loadObservations(JavaSparkContext sparkContext, SQLContext sqlContext,
    String path) {
  JavaRDD<Row> rowRdd = sparkContext.textFile(path).map((String line) -> {
    String[] tokens = line.split("\t");
    // Tokens 0-2 are year, month, and day of month.
    ZonedDateTime dt = ZonedDateTime.of(Integer.parseInt(tokens[0]),
        Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), 0, 0, 0, 0,
        ZoneId.systemDefault());
    String symbol = tokens[3];
    double price = Double.parseDouble(tokens[5]);
    return RowFactory.create(Timestamp.from(dt.toInstant()), symbol, price);
  });

  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("timestamp", DataTypes.TimestampType, true));
  fields.add(DataTypes.createStructField("symbol", DataTypes.StringType, true));
  fields.add(DataTypes.createStructField("price", DataTypes.DoubleType, true));
  StructType schema = DataTypes.createStructType(fields);

  return sqlContext.createDataFrame(rowRdd, schema);
}
Example 4
Source File: TestAvroUtils.java from envelope with Apache License 2.0
@Test
public void toSchemaArraysNested() throws Exception {
  StructType input = DataTypes.createStructType(Lists.newArrayList(
      // Outer
      DataTypes.createStructField("Outer", DataTypes.createArrayType(
          // Inner
          DataTypes.createArrayType(DataTypes.IntegerType, false), false), false)
  ));

  Schema schema = AvroUtils.schemaFor(input);

  assertEquals("Invalid field count", 1, schema.getFields().size());
  assertEquals("Invalid field name", "Outer", schema.getFields().get(0).name());
  assertEquals("Invalid field type", Schema.Type.ARRAY, schema.getFields().get(0).schema().getType());
  assertEquals("Invalid outer element type, i.e. the inner type", Schema.Type.ARRAY,
      schema.getFields().get(0).schema().getElementType().getType());
  assertEquals("Invalid inner element type", Schema.Type.INT,
      schema.getFields().get(0).schema().getElementType().getElementType().getType());

  //System.out.println(schema.toString(true));
}
Example 5
Source File: IfZeroVectorBridgeTest.java from spark-transformers with Apache License 2.0
public DataFrame createDF(JavaRDD<Tuple2<Vector, String>> rdd) {
  // Generate the schema for the DataFrame
  List<StructField> fields = new ArrayList<StructField>();
  fields.add(DataTypes.createStructField("vectorized_count", new VectorUDT(), true));
  fields.add(DataTypes.createStructField("product_title", DataTypes.StringType, true));
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD to Rows
  JavaRDD<Row> rowRDD = rdd.map(
      new Function<Tuple2<Vector, String>, Row>() {
        public Row call(Tuple2<Vector, String> record) {
          return RowFactory.create(record._1(), record._2());
        }
      });

  return sqlContext.createDataFrame(rowRDD, schema);
}
Example 6
Source File: MLContextTest.java from systemds with Apache License 2.0
@Test
public void testDataFrameGoodMetadataDML() {
  System.out.println("MLContextTest - DataFrame good metadata DML");

  List<String> list = new ArrayList<>();
  list.add("10,20,30");
  list.add("40,50,60");
  list.add("70,80,90");
  JavaRDD<String> javaRddString = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  MatrixMetadata mm = new MatrixMetadata(3, 3, 9);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
  setExpectedStdOut("sum: 450.0");
  ml.execute(script);
}
Example 7
Source File: TestSchemaUtils.java from envelope with Apache License 2.0
@Test
public void testAppendFields() {
  StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
  StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
  StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
  StructType schema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));

  StructField field4 = DataTypes.createStructField("field4", DataTypes.BooleanType, true);
  StructField field5 = DataTypes.createStructField("field5", DataTypes.StringType, true);
  StructType appendSchema = SchemaUtils.appendFields(schema, Lists.newArrayList(field4, field5));

  assertEquals(appendSchema.length(), 5);
  assertEquals(appendSchema.fields()[0], field1);
  assertEquals(appendSchema.fields()[1], field2);
  assertEquals(appendSchema.fields()[2], field3);
  assertEquals(appendSchema.fields()[3], field4);
  assertEquals(appendSchema.fields()[4], field5);
}
Example 8
Source File: TestNanosWithSeqNumTimeModel.java from envelope with Apache License 2.0
@Test
public void testAppendFields() {
  StructType withoutSchema = DataTypes.createStructType(
      Lists.newArrayList(
          DataTypes.createStructField("other", DataTypes.StringType, true)));
  Row without = new RowWithSchema(withoutSchema, "hello");

  Row with = tm.appendFields(without);

  assertEquals(with.schema(), withoutSchema.add(nanoField).add(seqNumField));
}
Example 9
Source File: TestRowUtils.java from envelope with Apache License 2.0
@Test
public void testDifferent() {
  StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
  StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
  StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
  StructType schema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));

  Row row1 = new RowWithSchema(schema, "hello", 1, 2.0);
  Row row2 = new RowWithSchema(schema, "hello", 10, -2.0);

  assertTrue(RowUtils.different(row1, row2, Lists.newArrayList("field1", "field2", "field3")));
  assertTrue(!RowUtils.different(row1, row2, Lists.newArrayList("field1")));
}
Example 10
Source File: TestStringDateTimeModel.java from envelope with Apache License 2.0
@Test
public void testAppendFields() {
  StructType withoutSchema = DataTypes.createStructType(
      Lists.newArrayList(
          DataTypes.createStructField("other", DataTypes.StringType, true)));
  Row without = new RowWithSchema(withoutSchema, "hello");

  Row with = tm.appendFields(without);

  assertEquals(with.schema(), withoutSchema.add(field));
}
Example 11
Source File: SchemaUtils.java from envelope with Apache License 2.0
public static StructType appendFields(StructType from, List<StructField> fields) {
  StructType to = DataTypes.createStructType(from.fields());

  for (StructField field : fields) {
    to = to.add(field);
  }

  return to;
}
Example 12
Source File: TestAvroUtils.java from envelope with Apache License 2.0
@Test
public void toSchemaNullable() throws Exception {
  StructType input = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("field1", DataTypes.BooleanType, false),
      DataTypes.createStructField("field2", DataTypes.StringType, true),
      DataTypes.createStructField("field3", DataTypes.DateType, false),
      DataTypes.createStructField("field4", DataTypes.TimestampType, false)
  ));

  Schema schema = AvroUtils.schemaFor(input);

  assertEquals("Invalid field count", 4, schema.getFields().size());

  // Not nullable
  assertEquals("Invalid field name", "field1", schema.getFields().get(0).name());
  assertEquals("Invalid field type", Schema.Type.BOOLEAN, schema.getFields().get(0).schema().getType());
  assertEquals("Invalid field default", null, schema.getFields().get(0).defaultVal());

  // Nullable (as opposed to Optional) Avro construct, but with no default
  assertEquals("Invalid nullable (union) type", Schema.Type.UNION, schema.getFields().get(1).schema().getType());
  assertEquals("Invalid nullable (union) type count", 2, schema.getFields().get(1).schema().getTypes().size());
  assertEquals("Invalid field type", Schema.Type.STRING,
      schema.getFields().get(1).schema().getTypes().get(0).getType());
  assertEquals("Invalid union default", null, schema.getFields().get(1).defaultVal());

  //System.out.println(schema.toString(true));
}
Example 13
Source File: TestPivotDeriver.java from envelope with Apache License 2.0
@Test
public void testStaticPivot() throws Exception {
  List<Row> sourceList = Lists.newArrayList(
      RowFactory.create("A", "hello", "1"),
      RowFactory.create("A", "world", "2"),
      RowFactory.create("B", "hello", "3"),
      RowFactory.create("C", "world", "4"),
      RowFactory.create("D", "dummy", "5"));
  StructType schema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("entity_id", DataTypes.StringType, true),
      DataTypes.createStructField("key", DataTypes.StringType, true),
      DataTypes.createStructField("value", DataTypes.StringType, true)
  ));
  Dataset<Row> source = Contexts.getSparkSession().createDataFrame(sourceList, schema);

  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("source", source);

  Config config = ConfigFactory.empty()
      .withValue(PivotDeriver.STEP_NAME_CONFIG, ConfigValueFactory.fromAnyRef("source"))
      .withValue(PivotDeriver.ENTITY_KEY_FIELD_NAMES_CONFIG,
          ConfigValueFactory.fromAnyRef(Lists.newArrayList("entity_id")))
      .withValue(PivotDeriver.PIVOT_KEY_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("key"))
      .withValue(PivotDeriver.PIVOT_VALUE_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("value"))
      .withValue(PivotDeriver.PIVOT_KEYS_SOURCE_CONFIG,
          ConfigValueFactory.fromAnyRef(PivotDeriver.PIVOT_KEYS_SOURCE_STATIC))
      .withValue(PivotDeriver.PIVOT_KEYS_LIST_CONFIG,
          ConfigValueFactory.fromAnyRef(Lists.newArrayList("hello", "world")));

  PivotDeriver d = new PivotDeriver();
  assertNoValidationFailures(d, config);
  d.configure(config);

  List<Row> results = d.derive(dependencies).collectAsList();

  assertEquals(results.size(), 4);
  assertTrue(results.contains(RowFactory.create("A", "1", "2")));
  assertTrue(results.contains(RowFactory.create("B", "3", null)));
  assertTrue(results.contains(RowFactory.create("C", null, "4")));
  assertTrue(results.contains(RowFactory.create("D", null, null)));
}
Example 14
Source File: ValueRow.java from spliceengine with GNU Affero General Public License v3.0
@Override
public StructType schema() {
  StructField[] fields = new StructField[ncols];
  for (int i = 0; i < ncols; i++) {
    fields[i] = column[i].getStructField(getNamedColumn(i));
  }
  return DataTypes.createStructType(fields);
}
Example 15
Source File: ControlDataSet.java from spliceengine with GNU Affero General Public License v3.0
/**
 * Not Supported
 *
 * @param dsp
 * @param partitionBy
 * @param location
 * @param context
 * @return
 */
@Override
public DataSet<ExecRow> writeParquetFile(DataSetProcessor dsp, int[] partitionBy, String location,
    String compression, OperationContext context) {
  try {
    // Generate table schema
    String[] colNames;
    DataValueDescriptor[] dvds;
    if (context.getOperation() instanceof DMLWriteOperation) {
      dvds = context.getOperation().getExecRowDefinition().getRowArray();
      colNames = ((DMLWriteOperation) context.getOperation()).getColumnNames();
    } else if (context.getOperation() instanceof ExportOperation) {
      dvds = context.getOperation().getLeftOperation().getLeftOperation().getExecRowDefinition().getRowArray();
      ExportOperation export = (ExportOperation) context.getOperation();
      ResultColumnDescriptor[] descriptors = export.getSourceResultColumnDescriptors();
      colNames = new String[descriptors.length];
      int i = 0;
      for (ResultColumnDescriptor rcd : export.getSourceResultColumnDescriptors()) {
        colNames[i++] = rcd.getName();
      }
    } else {
      throw new IllegalArgumentException("Unsupported operation type: " + context.getOperation());
    }
    StructField[] fields = new StructField[colNames.length];
    for (int i = 0; i < colNames.length; i++) {
      fields[i] = dvds[i].getStructField(colNames[i]);
    }
    StructType tableSchema = DataTypes.createStructType(fields);

    RecordWriter<Void, Object> rw = ParquetWriterService.getFactory()
        .getParquetRecordWriter(location, compression, tableSchema);
    try {
      ExpressionEncoder<Row> encoder = RowEncoder.apply(tableSchema);
      while (iterator.hasNext()) {
        ValueRow vr = (ValueRow) iterator.next();
        context.recordWrite();
        rw.write(null, encoder.toRow(vr));
      }
    } finally {
      rw.close(null);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  ValueRow valueRow = new ValueRow(1);
  valueRow.setColumn(1, new SQLLongint(context.getRecordsWritten()));
  return new ControlDataSet(Collections.singletonList(valueRow).iterator());
}
Example 16
Source File: TestEventTimeHistoryPlanner.java from envelope with Apache License 2.0
@Test
public void testCarryForwardMultipleWhenNullOutOfOrderMultipleValued() {
  p = new EventTimeHistoryPlanner();
  config = config
      .withValue(EventTimeHistoryPlanner.CARRY_FORWARD_CONFIG_NAME, ConfigValueFactory.fromAnyRef(true))
      .withValue(EventTimeHistoryPlanner.VALUE_FIELD_NAMES_CONFIG_NAME,
          ConfigValueFactory.fromAnyRef(Lists.newArrayList("value1", "value2")));
  assertNoValidationFailures(p, config);
  p.configure(config);

  arrivingSchema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("key", DataTypes.StringType, false),
      DataTypes.createStructField("value1", DataTypes.StringType, true),
      DataTypes.createStructField("value2", DataTypes.StringType, true),
      DataTypes.createStructField("timestamp", DataTypes.LongType, false)));
  existingSchema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("key", DataTypes.StringType, false),
      DataTypes.createStructField("value1", DataTypes.StringType, false),
      DataTypes.createStructField("value2", DataTypes.StringType, false),
      DataTypes.createStructField("timestamp", DataTypes.LongType, false),
      DataTypes.createStructField("startdate", DataTypes.LongType, false),
      DataTypes.createStructField("enddate", DataTypes.LongType, false),
      DataTypes.createStructField("currentflag", DataTypes.StringType, false),
      DataTypes.createStructField("lastupdated", DataTypes.StringType, false)));

  existing.add(new RowWithSchema(existingSchema, "a", "hello1:100", "hello2:100", 100L, 100L,
      253402214400000L, EventTimeHistoryPlanner.CURRENT_FLAG_DEFAULT_YES, ""));
  arriving.add(new RowWithSchema(arrivingSchema, "a", null, "hello2:200", 200L));
  arriving.add(new RowWithSchema(arrivingSchema, "a", "hello1:150", null, 150L));
  key = new RowWithSchema(keySchema, "a");

  List<Row> planned = p.planMutationsForKey(key, arriving, existing);

  assertEquals(planned.size(), 3);
  assertEquals(PlannerUtils.getMutationType(planned.get(0)), MutationType.UPDATE);
  assertEquals(planned.get(0).getAs("value1"), "hello1:100");
  assertEquals(planned.get(0).getAs("value2"), "hello2:100");
  assertEquals(planned.get(0).getAs("startdate"), 100L);
  assertEquals(planned.get(0).getAs("enddate"), 149L);
  assertEquals(planned.get(0).getAs("currentflag"), EventTimeHistoryPlanner.CURRENT_FLAG_DEFAULT_NO);
  assertEquals(PlannerUtils.getMutationType(planned.get(1)), MutationType.INSERT);
  assertEquals(planned.get(1).getAs("value1"), "hello1:150");
  assertEquals(planned.get(1).getAs("value2"), "hello2:100");
  assertEquals(planned.get(1).getAs("startdate"), 150L);
  assertEquals(planned.get(1).getAs("enddate"), 199L);
  assertEquals(planned.get(1).getAs("currentflag"), EventTimeHistoryPlanner.CURRENT_FLAG_DEFAULT_NO);
  assertEquals(PlannerUtils.getMutationType(planned.get(2)), MutationType.INSERT);
  assertEquals(planned.get(2).getAs("value1"), "hello1:150");
  assertEquals(planned.get(2).getAs("value2"), "hello2:200");
  assertEquals(planned.get(2).getAs("startdate"), 200L);
  assertEquals(planned.get(2).getAs("enddate"), 253402214400000L);
  assertEquals(planned.get(2).getAs("currentflag"), EventTimeHistoryPlanner.CURRENT_FLAG_DEFAULT_YES);
}
Example 17
Source File: VideoStreamProcessor.java from video-stream-classification with Apache License 2.0
public static void main(String[] args) throws Exception {
  // Read properties
  Properties prop = PropertyFileReader.readPropertyFile();

  // SparkSession
  SparkSession spark = SparkSession
      .builder()
      .appName("VideoStreamProcessor")
      .master(prop.getProperty("spark.master.url"))
      .getOrCreate();

  // Directory to save image files with motion detected
  final String processedImageDir = prop.getProperty("processed.output.dir");
  logger.warn("Output directory for saving processed images is set to " + processedImageDir
      + ". This is configured in the processed.output.dir key of the property file.");

  // Create schema for the JSON message
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("cameraId", DataTypes.StringType, true),
      DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
      DataTypes.createStructField("rows", DataTypes.IntegerType, true),
      DataTypes.createStructField("cols", DataTypes.IntegerType, true),
      DataTypes.createStructField("type", DataTypes.IntegerType, true),
      DataTypes.createStructField("data", DataTypes.StringType, true)
  });

  // Create Dataset from stream messages from Kafka
  Dataset<VideoEventData> ds = spark
      .readStream()
      .format("kafka")
      .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
      .option("subscribe", prop.getProperty("kafka.topic"))
      .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
      .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
      .load()
      .selectExpr("CAST(value AS STRING) as message")
      .select(functions.from_json(functions.col("message"), schema).as("json"))
      .select("json.*")
      .as(Encoders.bean(VideoEventData.class));

  // Key-value pairs of cameraId -> VideoEventData
  KeyValueGroupedDataset<String, VideoEventData> kvDataset =
      ds.groupByKey(new MapFunction<VideoEventData, String>() {
        @Override
        public String call(VideoEventData value) throws Exception {
          return value.getCameraId();
        }
      }, Encoders.STRING());

  // Process each camera's events, carrying state between micro-batches
  Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(
      new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData, VideoEventData>() {
        @Override
        public VideoEventData call(String key, Iterator<VideoEventData> values,
            GroupState<VideoEventData> state) throws Exception {
          logger.warn("CameraId=" + key + " PartitionId=" + TaskContext.getPartitionId());

          // Check previous state
          VideoEventData existing = null;
          if (state.exists()) {
            existing = state.get();
          }

          // Classify image
          VideoEventData processed = ImageProcessor.process(key, values, processedImageDir, existing);

          // Update last processed
          if (processed != null) {
            state.update(processed);
          }
          return processed;
        }
      }, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

  // Start the streaming query
  StreamingQuery query = processedDataset.writeStream()
      .outputMode("update")
      .format("console")
      .start();

  // Await termination
  query.awaitTermination();
}
Example 18
Source File: AllButEmptyStringAggregationFunction.java from bpmn.ai with BSD 3-Clause "New" or "Revised" License
public AllButEmptyStringAggregationFunction() {
  inputSchema = DataTypes.createStructType(new StructField[]{
      DataTypes.createStructField("value", DataTypes.StringType, true)});
  bufferSchema = DataTypes.createStructType(new StructField[]{
      DataTypes.createStructField("currentSelection", DataTypes.StringType, true)});
}
Example 19
Source File: StringDateTimeModel.java from envelope with Apache License 2.0
@Override
public StructType getSchema() {
  return DataTypes.createStructType(Lists.newArrayList(field));
}
Example 20
Source File: SparkDataSetTest.java from spliceengine with GNU Affero General Public License v3.0
@Test
public void testFoobar() {
  List<Row> foo = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    ValueRow row = new ValueRow(1);
    row.setColumn(1, new SQLInteger(i));
    foo.add(row);
  }
  StructType schema = DataTypes.createStructType(new StructField[]{
      DataTypes.createStructField("col1", DataTypes.IntegerType, true)});

  SpliceSpark.getSessionUnsafe().createDataFrame(foo, schema)
      .write().format("orc").mode(SaveMode.Append)
      .orc("/Users/jleach/Documents/workspace/spliceengine/hbase_sql/target/external/orc_it");

  Column filter = (new Column("col1")).gt(1L).and(new Column("col1").lt(1L));
  SpliceSpark.getSessionUnsafe().read().schema(schema)
      .orc("/Users/jleach/Documents/workspace/spliceengine/hbase_sql/target/external/orc_it")
      .filter(filter).show();
}