org.apache.spark.sql.types.StructType Java Examples
The following examples show how to use
org.apache.spark.sql.types.StructType.
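Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two common ways a StructType is built in Java; the class and column names are invented for illustration:

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class StructTypeSketch {
  public static void main(String[] args) {
    // Factory style: build the fields as a List and pass it to DataTypes.createStructType
    List<StructField> fields = Arrays.asList(
        DataTypes.createStructField("id", DataTypes.IntegerType, false),
        DataTypes.createStructField("name", DataTypes.StringType, true));
    StructType viaFactory = DataTypes.createStructType(fields);

    // Constructor style: pass an explicit StructField array
    StructType viaConstructor = new StructType(new StructField[] {
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("name", DataTypes.StringType, true, Metadata.empty())
    });

    // Both forms describe the same schema
    System.out.println(viaFactory.treeString());
    System.out.println(viaConstructor.json());
  }
}

Both styles appear throughout the examples below.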
Example #1
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);

  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
Example #2
Source File: BigQueryDataSourceReader.java From spark-bigquery-connector with Apache License 2.0

public BigQueryDataSourceReader(
    TableInfo table,
    BigQueryClient bigQueryClient,
    BigQueryReadClientFactory bigQueryReadClientFactory,
    ReadSessionCreatorConfig readSessionCreatorConfig,
    Optional<String> globalFilter,
    Optional<StructType> schema) {
  this.table = table;
  this.tableId = table.getTableId();
  this.readSessionCreatorConfig = readSessionCreatorConfig;
  this.bigQueryClient = bigQueryClient;
  this.bigQueryReadClientFactory = bigQueryReadClientFactory;
  this.readSessionCreator = new ReadSessionCreator(readSessionCreatorConfig, bigQueryClient, bigQueryReadClientFactory);
  this.globalFilter = globalFilter;
  this.schema = schema;
}
Example #3
Source File: ParquetWithSparkSchemaVisitor.java From iceberg with Apache License 2.0

private static <T> List<T> visitFields(StructType struct, GroupType group,
                                       ParquetWithSparkSchemaVisitor<T> visitor) {
  StructField[] sFields = struct.fields();
  Preconditions.checkArgument(sFields.length == group.getFieldCount(),
      "Structs do not match: %s and %s", struct, group);

  List<T> results = Lists.newArrayListWithExpectedSize(group.getFieldCount());
  for (int i = 0; i < sFields.length; i += 1) {
    Type field = group.getFields().get(i);
    StructField sField = sFields[i];
    Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())),
        "Structs do not match: field %s != %s", field.getName(), sField.name());
    results.add(visitField(sField, field, visitor));
  }

  return results;
}
Example #4
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
  System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

  List<Vector> list = new ArrayList<>();
  list.add(Vectors.dense(1.0, 2.0, 3.0));
  list.add(Vectors.dense(4.0, 5.0, 6.0));
  list.add(Vectors.dense(7.0, 8.0, 9.0));
  JavaRDD<Vector> javaRddVector = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
  setExpectedStdOut("sum: 45.0");
  ml.execute(script);
}
Example #5
Source File: ProtobufUtils.java From envelope with Apache License 2.0

/**
 * Retrieves and converts Protobuf fields from a Message.
 * <p>
 * If the field in the {@link com.google.protobuf.Descriptors.Descriptor} exists in the {@link Message}, the value is
 * retrieved and converted using {@link #getFieldValue(Descriptors.FieldDescriptor, Object, DataType)}.
 * Otherwise, the field value is {@code null}.
 * The extraction honors the order of the {@code Descriptor}.
 *
 * @param dsc the Protobuf Descriptor with all fields
 * @param msg the Message with the current field values
 * @param schema the Dataset schema derived from the Descriptor
 * @return a list of converted values
 */
public static List<Object> buildRowValues(Descriptors.Descriptor dsc, Message msg, StructType schema) {
  List<Object> values = new ArrayList<>();
  Object val;

  for (Descriptors.FieldDescriptor fd : dsc.getFields()) {
    if ((!fd.isRepeated() && msg.hasField(fd)) || (fd.isRepeated() && msg.getRepeatedFieldCount(fd) > 0)) {
      val = getFieldValue(fd, msg.getField(fd), schema.apply(fd.getName()).dataType());
    } else {
      LOG.trace("FieldDescriptor[{}] => not found", fd.getFullName());
      val = null;
    }
    values.add(val);
  }

  return values;
}
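The schema.apply(fd.getName()).dataType() call above uses StructType's by-name field lookup, which throws an IllegalArgumentException if the name is absent. A minimal sketch of the same lookups outside the Protobuf context, with invented field names:

StructType schema = new StructType(new StructField[] {
    new StructField("id", DataTypes.LongType, false, Metadata.empty()),
    new StructField("payload", DataTypes.StringType, true, Metadata.empty())
});

// Look up a field by name and read its data type
DataType payloadType = schema.apply("payload").dataType();

// Equivalent lookup via the field's positional index
int idx = schema.fieldIndex("payload");
StructField byIndex = schema.fields()[idx];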
Example #6
Source File: NManualBuildAndQueryCuboidTest.java From kylin-on-parquet-v2 with Apache License 2.0

private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
  StructField[] structFieldList = layoutDs.schema().fields();
  String[] columns = layoutDs.columns();

  int index = 0;
  StructField[] outStructFieldList = new StructField[structFieldList.length];
  for (int i = 0; i < structFieldList.length; i++) {
    if (columns[i].equalsIgnoreCase(fieldName)) {
      index = i;
      StructField structField = structFieldList[i];
      outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
    } else {
      outStructFieldList[i] = structFieldList[i];
    }
  }

  OUT_SCHEMA = new StructType(outStructFieldList);

  return index;
}
Example #7
Source File: JavaBinarizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("JavaBinarizerExample")
      .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, 0.1),
      RowFactory.create(1, 0.8),
      RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
Example #8
Source File: SchemaUtilTest.java From spark-llap with Apache License 2.0

@Test
public void testBuildHiveCreateTableQueryFromSparkDFSchema() {
  HiveWarehouseSessionState sessionState =
      HiveWarehouseBuilder
          .session(session)
          .userPassword(TEST_USER, TEST_PASSWORD)
          .hs2url(TEST_HS2_URL)
          .dbcp2Conf(TEST_DBCP2_CONF)
          .maxExecResults(TEST_EXEC_RESULTS_MAX)
          .defaultDB(TEST_DEFAULT_DB)
          .sessionStateForTest();
  HiveWarehouseSession hive = new MockHiveWarehouseSessionImpl(sessionState);
  HiveWarehouseSessionImpl.HIVE_WAREHOUSE_CONNECTOR_INTERNAL =
      "com.hortonworks.spark.sql.hive.llap.MockHiveWarehouseConnector";
  StructType schema = getSchema();
  String query = SchemaUtil.buildHiveCreateTableQueryFromSparkDFSchema(schema, "testDB", "testTable");
  System.out.println("create table query:" + query);
  assertTrue(hive.executeUpdate(query));
}
Example #9
Source File: SparkParquetReadersFlatDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(PROJECTED_SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
Example #10
Source File: TestMorphlineTranslator.java From envelope with Apache License 2.0

@Test
public void getSchema() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "default");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("bar", "foo"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  StructType schema = translator.getProvidingSchema();

  Assert.assertEquals("Invalid number of SchemaFields", 2, schema.fields().length);
  Assert.assertEquals("Invalid DataType", DataTypes.IntegerType, schema.fields()[0].dataType());
  Assert.assertEquals("Invalid DataType", DataTypes.StringType, schema.fields()[1].dataType());
}
Example #11
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
  System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

  List<Tuple2<Double, Vector>> list = new ArrayList<>();
  list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
  list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
  list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
  JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
  setExpectedStdOut("sum: 45.0");
  ml.execute(script);
}
Example #12
Source File: RowProcessor.java From net.jgp.labs.spark with Apache License 2.0

@Override
public void call(JavaRDD<String> rdd) throws Exception {
  JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
    private static final long serialVersionUID = 5167089361335095997L;

    @Override
    public Row call(String msg) {
      Row row = RowFactory.create(msg);
      return row;
    }
  });

  // Create Schema
  StructType schema = DataTypes.createStructType(
      new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

  // Get Spark 2.0 session
  SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
  Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
  msgDataFrame.show();
}
Example #13
Source File: HoodieReadClient.java From hudi with Apache License 2.0

/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be same for multiple keys, so need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, we need to further filter out, for only rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
Example #14
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
Example #15
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
  System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

  List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
  list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
  list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
  list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
  JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
  setExpectedStdOut("sum: 45.0");
  ml.execute(script);
}
Example #16
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
  System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

  List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
  list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
  list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
  list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
  JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
  setExpectedStdOut("sum: 45.0");
  ml.execute(script);
}
Example #17
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified() {
  System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column, no format specified");

  List<String> list = new ArrayList<>();
  list.add("2,2,2");
  list.add("3,3,3");
  list.add("4,4,4");
  JavaRDD<String> javaRddString = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
  setExpectedStdOut("sum: 27.0");
  ml.execute(script);
}
Example #18
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
  System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

  List<Tuple2<Double, Vector>> list = new ArrayList<>();
  list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
  list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
  list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
  JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
  setExpectedStdOut("sum: 45.0");
  ml.execute(script);
}
Example #19
Source File: ExternalTableUtils.java From spliceengine with GNU Affero General Public License v3.0

public static StructType supportAvroDateType(StructType schema, String storedAs) {
  if (storedAs.toLowerCase().equals("a")) {
    for (int i = 0; i < schema.size(); i++) {
      StructField column = schema.fields()[i];
      if (column.dataType().equals(DataTypes.DateType)) {
        StructField replace = DataTypes.createStructField(column.name(), DataTypes.StringType,
            column.nullable(), column.metadata());
        schema.fields()[i] = replace;
      }
    }
  }
  return schema;
}
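The helper above overwrites entries in the array returned by schema.fields(). A hedged, non-mutating sketch of the same date-to-string substitution, assuming the caller is free to build and return a fresh StructType from the same schema parameter:

StructField[] original = schema.fields();
StructField[] adjusted = new StructField[original.length];
for (int i = 0; i < original.length; i++) {
  StructField column = original[i];
  // Swap DateType columns to StringType, copying name, nullability, and metadata
  adjusted[i] = column.dataType().equals(DataTypes.DateType)
      ? DataTypes.createStructField(column.name(), DataTypes.StringType, column.nullable(), column.metadata())
      : column;
}
StructType result = new StructType(adjusted);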
Example #20
Source File: SchemaUtils.java From envelope with Apache License 2.0

public static StructType subsetSchema(StructType schema, final List<String> fieldNames) {
  Seq<StructField> fieldSeq = schema.toTraversable().filter(new AbstractFunction1<StructField, Object>() {
    @Override
    public Object apply(StructField field) {
      return fieldNames.contains(field.name());
    }
  }).toSeq();

  StructType subset = DataTypes.createStructType(JavaConversions.seqAsJavaList(fieldSeq));

  return subset;
}
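A hypothetical call site for the helper above; the schema and field names are invented for illustration:

StructType full = DataTypes.createStructType(Arrays.asList(
    DataTypes.createStructField("id", DataTypes.LongType, false),
    DataTypes.createStructField("name", DataTypes.StringType, true),
    DataTypes.createStructField("ts", DataTypes.TimestampType, true)));

// Keep only "id" and "ts"; the surviving fields retain their original order
StructType subset = SchemaUtils.subsetSchema(full, Arrays.asList("id", "ts"));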
Example #21
Source File: RDDConverterUtilsExt.java From systemds with Apache License 2.0

/**
 * Add element indices as new column to DataFrame
 *
 * @param df input data frame
 * @param sparkSession the Spark Session
 * @param nameOfCol name of index column
 * @return new data frame
 */
public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) {
  StructField[] oldSchema = df.schema().fields();
  StructField[] newSchema = new StructField[oldSchema.length + 1];
  for (int i = 0; i < oldSchema.length; i++) {
    newSchema[i] = oldSchema[i];
  }
  newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
  // JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID());
  JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
  return sparkSession.createDataFrame(newRows, new StructType(newSchema));
}
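A hypothetical call site for the helper above, assuming an existing SparkSession and input DataFrame; the column name is arbitrary:

Dataset<Row> withId = RDDConverterUtilsExt.addIDToDataFrame(df, sparkSession, "row_id");
// The result carries the original columns plus a trailing non-nullable double "row_id" column
withId.printSchema();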
Example #22
Source File: TestAvroUtils.java From envelope with Apache License 2.0

@Test
public void toSchemaStructTypesNested() throws Exception {
  StructType input = DataTypes.createStructType(Lists.newArrayList(
      // Outer
      DataTypes.createStructField("Outer", DataTypes.createStructType(
          Lists.newArrayList(
              // Inner
              DataTypes.createStructField("Inner", DataTypes.createStructType(Lists.newArrayList(
                  DataTypes.createStructField("field1", DataTypes.IntegerType, false)
              )), false)
          )), false)
      )
  );

  Schema schema = AvroUtils.schemaFor(input);

  assertEquals("Invalid outer record name", "record0", schema.getName());
  assertEquals("Invalid outer field count", 1, schema.getFields().size());
  assertEquals("Invalid outer field name", "Outer", schema.getFields().get(0).name());
  assertEquals("Invalid outer field type", Schema.Type.RECORD, schema.getFields().get(0).schema().getType());

  assertEquals("Invalid inner record name", "record1", schema.getFields().get(0).schema().getName());
  assertEquals("Invalid inner field count", 1, schema.getFields().get(0).schema().getFields().size());
  assertEquals("Invalid inner field name", "Inner", schema.getFields().get(0).schema().getFields().get(0).name());
  assertEquals("Invalid inner field type", Schema.Type.RECORD,
      schema.getFields().get(0).schema().getFields().get(0).schema().getType());

  assertEquals("Invalid inner record name", "record2",
      schema.getFields().get(0).schema().getFields().get(0).schema().getName());
  assertEquals("Invalid nested field count", 1,
      schema.getFields().get(0).schema().getFields().get(0).schema().getFields().size());
  assertEquals("Invalid nested field name", "field1",
      schema.getFields().get(0).schema().getFields().get(0).schema().getFields().get(0).name());
  assertEquals("Invalid nested field type", Schema.Type.INT,
      schema.getFields().get(0).schema().getFields().get(0).schema().getFields().get(0).schema().getType());

  //System.out.println(schema.toString(true));
}
Example #23
Source File: TranslateFunction.java From envelope with Apache License 2.0

@Override
public void receiveProvidedSchema(StructType providedSchema) {
  this.providedSchema = providedSchema;

  if (getTranslator() instanceof UsesProvidedSchema) {
    ((UsesProvidedSchema) getTranslator()).receiveProvidedSchema(providedSchema);
  }
}
Example #24
Source File: CsvDFSSource.java From hudi with Apache License 2.0

public CsvDFSSource(TypedProperties props,
    JavaSparkContext sparkContext,
    SparkSession sparkSession,
    SchemaProvider schemaProvider) {
  super(props, sparkContext, sparkSession, schemaProvider);
  this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
  if (schemaProvider != null) {
    sourceSchema = (StructType) SchemaConverters.toSqlType(schemaProvider.getSourceSchema())
        .dataType();
  } else {
    sourceSchema = null;
  }
}
Example #25
Source File: AbstractJavaEsSparkSQLTest.java From elasticsearch-hadoop with Apache License 2.0

private Dataset<Row> artistsAsDataset() throws Exception {
  // don't use the sc.textFile as it pulls in the Hadoop madness (2.x vs 1.x)
  Path path = Paths.get(testData.sampleArtistsDatUri());
  // because Windows...
  List<String> lines = Files.readAllLines(path, StandardCharsets.ISO_8859_1);
  JavaRDD<String> data = sc.parallelize(lines);

  StructType schema = DataTypes
      .createStructType(new StructField[] {
          DataTypes.createStructField("id", DataTypes.IntegerType, false),
          DataTypes.createStructField("name", DataTypes.StringType, false),
          DataTypes.createStructField("url", DataTypes.StringType, true),
          DataTypes.createStructField("pictures", DataTypes.StringType, true),
          DataTypes.createStructField("time", DataTypes.TimestampType, true) });

  JavaRDD<Row> rowData = data.map(new Function<String, String[]>() {
    @Override
    public String[] call(String line) throws Exception {
      return line.split("\t");
    }
  }).map(new Function<String[], Row>() {
    @Override
    public Row call(String[] r) throws Exception {
      return RowFactory.create(Integer.parseInt(r[0]), r[1], r[2], r[3],
          new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis()));
    }
  });

  return sqc.createDataFrame(rowData, schema);
}
Example #26
Source File: SparkTable.java From iceberg with Apache License 2.0

public SparkTable(Table icebergTable, StructType requestedSchema) {
  this.icebergTable = icebergTable;
  this.requestedSchema = requestedSchema;

  if (requestedSchema != null) {
    // convert the requested schema to throw an exception if any requested fields are unknown
    SparkSchemaUtil.convert(icebergTable.schema(), requestedSchema);
  }
}
Example #27
Source File: JavaDCTExample.java From SparkDemo with MIT License

public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("JavaDCTExample")
      .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
      RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
      RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  DCT dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false);

  Dataset<Row> dctDf = dct.transform(df);

  dctDf.select("featuresDCT").show(false);
  // $example off$

  spark.stop();
}
Example #28
Source File: KafkaInput.java From envelope with Apache License 2.0

@Override
public void receiveExpectedSchema(StructType expectedSchema) {
  this.expectedSchema = expectedSchema;

  List<String> fieldNames = Lists.newArrayList(KEY_FIELD_NAME, Translator.VALUE_FIELD_NAME);

  for (String fieldName : fieldNames) {
    if (Lists.newArrayList(expectedSchema.fieldNames()).contains(fieldName)) {
      DataType fieldDataType = expectedSchema.fields()[expectedSchema.fieldIndex(fieldName)].dataType();

      if (fieldDataType.equals(DataTypes.StringType)) {
        kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
      } else if (fieldDataType.equals(DataTypes.BinaryType)) {
        kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
      } else {
        throw new RuntimeException("Translator expects '" + fieldName + "' field to be of type '" + fieldDataType +
            "' but Kafka input only supports providing '" + fieldName + "' field as either string or binary.");
      }
    } else {
      // If the translator doesn't expect the field then provide it as binary
      kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    }
  }
}
Example #29
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public SparkTable getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
  // TODO: if partitioning is non-null, the table is being created?
  // Get Iceberg table from options
  Configuration conf = new Configuration(SparkSession.active().sparkContext().hadoopConfiguration());
  Table icebergTable = getTableAndResolveHadoopConfiguration(options, conf);

  // Build Spark table based on Iceberg table, and return it
  return new SparkTable(icebergTable, schema);
}