org.apache.spark.sql.DataFrameReader Java Examples
The following examples show how to use
org.apache.spark.sql.DataFrameReader.
You can go to the original project or source file by following the links above each example.
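Before the individual examples, here is a minimal, hedged sketch of the general DataFrameReader call chain (format, option, load). The class name, file path, and option values are placeholders for illustration and are not taken from the projects below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DataFrameReaderBasics {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("DataFrameReaderBasics")
        .master("local[*]") // local mode, for illustration only
        .getOrCreate();

    // read() returns a DataFrameReader; format()/option() configure it,
    // and load() (or shortcuts such as json()/csv()) produces a Dataset<Row>.
    Dataset<Row> df = spark.read()
        .format("csv")
        .option("header", "true")   // placeholder option
        .load("/tmp/example.csv");  // placeholder path

    df.show();
    spark.stop();
  }
}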
Example #1
Source File: CsvDFSSource.java From hudi with Apache License 2.0
/**
 * Reads the CSV files and parses the lines into {@link Dataset} of {@link Row}.
 *
 * @param pathStr The list of file paths, separated by ','.
 * @return {@link Dataset} of {@link Row} containing the records.
 */
private Option<Dataset<Row>> fromFiles(Option<String> pathStr) {
  if (pathStr.isPresent()) {
    DataFrameReader dataFrameReader = sparkSession.read().format("csv");
    CSV_CONFIG_KEYS.forEach(optionKey -> {
      String configPropName = CSV_SRC_CONFIG_PREFIX + optionKey;
      String value = props.getString(configPropName, null);
      // Pass down the Hudi CSV configs to Spark DataFrameReader
      if (value != null) {
        dataFrameReader.option(optionKey, value);
      }
    });
    if (sourceSchema != null) {
      // Source schema is specified, pass it to the reader
      dataFrameReader.schema(sourceSchema);
    }
    dataFrameReader.option("inferSchema", Boolean.toString(sourceSchema == null));
    return Option.of(dataFrameReader.load(pathStr.get().split(",")));
  } else {
    return Option.empty();
  }
}
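The key pattern in Example #1 is that the reader either receives an explicit schema or is told to infer one, and that load() accepts multiple paths (hence the split on ','). Below is a standalone, hedged sketch of just that pattern; the paths and the two-column schema are made up for illustration and are not part of the hudi source.

import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class CsvSchemaSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("CsvSchemaSketch")
        .master("local[*]")
        .getOrCreate();

    // Hypothetical source schema; in CsvDFSSource this is the supplied sourceSchema (may be null).
    StructType sourceSchema = new StructType()
        .add("id", DataTypes.LongType)
        .add("name", DataTypes.StringType);

    DataFrameReader reader = spark.read().format("csv").option("header", "true");
    if (sourceSchema != null) {
      // An explicit schema takes precedence ...
      reader.schema(sourceSchema);
    }
    // ... and schema inference is enabled only when no schema was supplied.
    reader.option("inferSchema", Boolean.toString(sourceSchema == null));

    // load() takes a varargs list of paths.
    Dataset<Row> rows = reader.load("/tmp/part1.csv", "/tmp/part2.csv"); // placeholder paths
    rows.show();
    spark.stop();
  }
}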
Example #2
Source File: HiveWarehouseSessionImpl.java From spark-llap with Apache License 2.0
public Dataset<Row> table(String sql) {
  DataFrameReader dfr = session().read().format(HIVE_WAREHOUSE_CONNECTOR_INTERNAL)
      .option("table", sql);
  return dfr.load();
}
Example #3
Source File: HoodieIncrSource.java From hudi with Apache License 2.0
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {

  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.HOODIE_SRC_BASE_PATH));

  /*
   * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
   * Config.HOODIE_SRC_PARTITION_FIELDS));
   * List<String> partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>());
   * PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString(
   *     Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
   */
  String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
  int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
  boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
      Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);

  // Use begin Instant if set and non-empty
  Option<String> beginInstant =
      lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr : Option.empty();

  Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
      numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);

  if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
    return Pair.of(Option.empty(), instantEndpts.getKey());
  }

  // Do Incr pull. Set end instant if available
  DataFrameReader reader = sparkSession.read().format("org.apache.hudi")
      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
      .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());

  Dataset<Row> source = reader.load(srcPath);

  /*
   * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
   *
   * StructType newSchema = new StructType(source.schema().fields());
   * for (String field : partitionFields) {
   *   newSchema = newSchema.add(field, DataTypes.StringType, true);
   * }
   *
   * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
   * configured.
   *
   * Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> {
   *   // _hoodie_instant_time
   *   String instantTime = row.getString(0);
   *   IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue());
   *   if (!partitionFields.isEmpty()) {
   *     // _hoodie_partition_path
   *     String hoodiePartitionPath = row.getString(3);
   *     List<Object> partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream()
   *         .map(o -> (Object) o).collect(Collectors.toList());
   *     ValidationUtils.checkArgument(partitionVals.size() == partitionFields.size(),
   *         "#partition-fields != #partition-values-extracted");
   *     List<Object> rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq()));
   *     rowObjs.addAll(partitionVals);
   *     return RowFactory.create(rowObjs.toArray());
   *   }
   *   return row;
   * }, RowEncoder.apply(newSchema));
   *
   * log.info("Validated Source Schema :" + validated.schema());
   */

  // Remove Hoodie meta columns except partition path from input source
  final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
      .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));

  // log.info("Final Schema from Source is :" + src.schema());
  return Pair.of(Option.of(src), instantEndpts.getRight());
}
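For readers who want to reproduce the incremental pull outside of this source, the sketch below uses the literal Hudi option keys that, to my knowledge, back the DataSourceReadOptions constants referenced above; the table path and instant times are placeholders, and the exact key names should be verified against your Hudi version.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class HudiIncrementalReadSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("HudiIncrementalReadSketch")
        .master("local[*]")
        .getOrCreate();

    // Assumed literal keys for DataSourceReadOptions.QUERY_TYPE_OPT_KEY() and friends;
    // check them against the Hudi release you are running.
    Dataset<Row> incremental = spark.read().format("org.apache.hudi")
        .option("hoodie.datasource.query.type", "incremental")
        .option("hoodie.datasource.read.begin.instanttime", "20230101000000") // placeholder instant
        .option("hoodie.datasource.read.end.instanttime", "20230102000000")   // placeholder instant
        .load("/tmp/hudi/table_path"); // placeholder base path

    incremental.show();
    spark.stop();
  }
}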
Example #4
Source File: HiveWarehouseSessionImpl.java From spark-llap with Apache License 2.0
public Dataset<Row> executeQuery(String sql) {
  DataFrameReader dfr = session().read().format(HIVE_WAREHOUSE_CONNECTOR_INTERNAL)
      .option("query", sql);
  return dfr.load();
}
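Examples #2 and #4 both wrap the Hive Warehouse Connector data source behind a DataFrameReader, differing only in whether they pass a "table" or a "query" option. A hedged usage sketch follows; the import path and builder entry point (HiveWarehouseSession.session(spark).build()) follow the spark-llap documentation I am aware of, so treat them as assumptions for your connector version, and the table and query names are placeholders.

import com.hortonworks.hwc.HiveWarehouseSession;
import org.apache.spark.sql.SparkSession;

public class HwcUsageSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("HwcUsageSketch")
        .getOrCreate();

    // Assumed builder API from spark-llap; verify against your connector version.
    HiveWarehouseSession hive = HiveWarehouseSession.session(spark).build();

    // Delegates to the table(...) method shown in Example #2.
    hive.table("default.some_table").show(); // placeholder table name

    // Delegates to the executeQuery(...) method shown in Example #4.
    hive.executeQuery("SELECT COUNT(*) FROM default.some_table").show(); // placeholder query

    spark.stop();
  }
}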
Example #5
Source File: DataFrameCreate.java From SparkDemo with MIT License
public static void main(String[] args) {
  JavaSparkContext sc = SparkUtils.getLocalSparkContext(DataFrameCreate.class);

  // Create a DataFrame by reading a JSON file
  SQLContext sqlContext = new SQLContext(sc);
  DataFrameReader dataFrameReader = sqlContext.read();
  Dataset<Row> dataset = dataFrameReader.json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");
  dataset.show();

  sc.close();
}
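Example #5 uses the older SQLContext entry point. For comparison, here is a minimal sketch of the same JSON read through SparkSession, the current entry point since Spark 2.x; the file path is a placeholder rather than the project's Constant.LOCAL_FILE_PREX value.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DataFrameCreateWithSparkSession {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("DataFrameCreateWithSparkSession")
        .master("local[*]")
        .getOrCreate();

    // spark.read() returns a DataFrameReader, just like sqlContext.read() above.
    Dataset<Row> people = spark.read().json("/path/to/people.json"); // placeholder path
    people.show();

    spark.stop();
  }
}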