Java Code Examples for org.apache.spark.sql.Dataset#map()
The following examples show how to use org.apache.spark.sql.Dataset#map().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
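Before the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typed map call they all rely on: Dataset#map(MapFunction, Encoder) transforms each element and requires an explicit Encoder for the result type. The class name and input path are hypothetical; any small CSV with a header row would do.

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DatasetMapSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("Dataset#map sketch").master("local").getOrCreate();

    // Hypothetical input file; any small CSV with a header row works
    Dataset<Row> df = spark.read().option("header", "true").csv("data/books.csv");

    // map() takes a MapFunction plus an Encoder describing the result type
    Dataset<String> joined = df.map(
        (MapFunction<Row, String>) row -> row.mkString(","),
        Encoders.STRING());
    joined.show();

    spark.stop();
  }
}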
Example 1
Source File: CsvToDatasetBookAsJson.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "CSV to Dataset<Book> as JSON").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
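The BookMapper class is not included in this snippet. A hypothetical implementation, assuming the CSV exposes columns such as title and author (the actual column names are not shown), would turn each Row into a JSON string that spark.read().json(bookDf) can parse back into a DataFrame:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

class BookMapper implements MapFunction<Row, String> {
  private static final long serialVersionUID = 1L;

  @Override
  public String call(Row row) throws Exception {
    // Illustrative only: serialize a couple of assumed columns to a JSON string
    return "{\"title\":\"" + row.getAs("title")
        + "\",\"author\":\"" + row.getAs("author") + "\"}";
  }
}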
Example 2
Source File: WindowAssignTranslatorBatch.java From beam with Apache License 2.0 | 6 votes |
@Override
public void translateTransform(
    PTransform<PCollection<T>, PCollection<T>> transform, TranslationContext context) {

  Window.Assign<T> assignTransform = (Window.Assign<T>) transform;
  @SuppressWarnings("unchecked")
  final PCollection<T> input = (PCollection<T>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<T> output = (PCollection<T>) context.getOutput();

  Dataset<WindowedValue<T>> inputDataset = context.getDataset(input);
  if (WindowingHelpers.skipAssignWindows(assignTransform, context)) {
    context.putDataset(output, inputDataset);
  } else {
    WindowFn<T, ?> windowFn = assignTransform.getWindowFn();
    WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
        WindowedValue.FullWindowedValueCoder.of(input.getCoder(), windowFn.windowCoder());
    Dataset<WindowedValue<T>> outputDataset =
        inputDataset.map(
            WindowingHelpers.assignWindowsMapFunction(windowFn),
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDataset(output, outputDataset);
  }
}
Example 3
Source File: CsvToDatasetBookToDataframeApp.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class));
  bookDs.show();
  bookDs.printSchema();

  Dataset<Row> df2 = bookDs.toDF();
  df2.show();
  df2.printSchema();
}
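Unlike Example 1, the BookMapper here produces a Book bean rather than a JSON string, and neither class is shown in the snippet. A hypothetical pair, assuming illustrative column names and only the getters/setters that Encoders.bean needs, might look like this:

import java.io.Serializable;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

public class Book implements Serializable {
  private String title;
  private String author;

  public String getTitle() { return title; }
  public void setTitle(String title) { this.title = title; }
  public String getAuthor() { return author; }
  public void setAuthor(String author) { this.author = author; }
}

class BookMapper implements MapFunction<Row, Book> {
  @Override
  public Book call(Row row) throws Exception {
    Book book = new Book();
    book.setTitle(row.getAs("title"));   // assumed column name
    book.setAuthor(row.getAs("author")); // assumed column name
    return book;
  }
}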
Example 4
Source File: SparkCubingJobTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
  Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

  for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
    FunctionDesc functionDesc = entry.getValue();
    if (functionDesc != null) {
      final String[] columns = layoutDs.columns();
      String functionName = functionDesc.returnType().dataType();

      if ("bitmap".equals(functionName)) {
        final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
        PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
        layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
          Object[] ret = new Object[value.size()];
          for (int i = 0; i < columns.length; i++) {
            if (i == finalIndex) {
              byte[] bytes = (byte[]) value.get(i);
              Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
              ret[i] = bitmapCounter.getLongCardinality();
            } else {
              ret[i] = value.get(i);
            }
          }
          return RowFactory.create(ret);
        }, RowEncoder.apply(OUT_SCHEMA));
      }
    }
  }
  return layoutDs;
}
Example 5
Source File: BookUrlBuilderApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
  SparkSession spark = SparkSession.builder().appName("Book URL Builder")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
  ds.printSchema();
  ds.show(20, 80);
}
Example 6
Source File: DataSetApplication.java From sparkResearch with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  SparkSession sparkSession = SparkSession.builder().master("local")
      .appName("Java Spark SQL")
      .getOrCreate();

  Person person = new Person("spark", 10);
  Encoder<Person> encoder = Encoders.bean(Person.class);
  Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
  dataset.show();
  // final output: {name:spark;age:10}

  /* Encoders for common types */
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
  Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) {
      return value + 1;
    }
  }, integerEncoder);
  result.collect();
  // final output: [2, 3]

  /* By providing a class, a DataFrame can be converted to a Dataset; fields are mapped by name */
  String url = "/usr/local/text.json";
  Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
  personDataset.show();
  // final output: name: ... age: ...
}
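The Person bean is not included in the snippet. Encoders.bean expects a public class with a no-argument constructor and getters/setters, so a hypothetical version compatible with the code above would be:

import java.io.Serializable;

public class Person implements Serializable {
  private String name;
  private int age;

  public Person() { }

  public Person(String name, int age) {
    this.name = name;
    this.age = age;
  }

  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
  public int getAge() { return age; }
  public void setAge(int age) { this.age = age; }
}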
Example 7
Source File: HashDeriver.java From envelope with Apache License 2.0 | 5 votes |
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String concatenatedFieldName = "_concatenated";

  Dataset<Row> dependency = getStepDataFrame(dependencies);

  Dataset<Row> concatenated = dependency.map(
      new ConcatenationFunction(delimiter, nullString, includeFields, excludeFields),
      RowEncoder.apply(dependency.schema().add(concatenatedFieldName, DataTypes.BinaryType)));

  return concatenated
      .withColumn(hashFieldName, functions.md5(functions.col(concatenatedFieldName)))
      .drop(concatenatedFieldName);
}
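ConcatenationFunction is defined elsewhere in the Envelope project. A simplified, hypothetical sketch (ignoring the include/exclude field lists) illustrates the Row-to-Row mapping pattern the deriver relies on: the mapper appends one binary column, and the RowEncoder above is built from the input schema plus that extra column.

import java.nio.charset.StandardCharsets;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

class ConcatenationFunction implements MapFunction<Row, Row> {
  private final String delimiter;
  private final String nullString;

  ConcatenationFunction(String delimiter, String nullString) {
    this.delimiter = delimiter;
    this.nullString = nullString;
  }

  @Override
  public Row call(Row row) throws Exception {
    StringBuilder joined = new StringBuilder();
    Object[] values = new Object[row.size() + 1];
    for (int i = 0; i < row.size(); i++) {
      Object value = row.get(i);
      values[i] = value;
      if (i > 0) {
        joined.append(delimiter);
      }
      joined.append(value == null ? nullString : value.toString());
    }
    // The appended column is binary, matching the BinaryType added to the schema above
    values[row.size()] = joined.toString().getBytes(StandardCharsets.UTF_8);
    return RowFactory.create(values);
  }
}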
Example 8
Source File: DataQualityDeriver.java From envelope with Apache License 2.0 | 5 votes |
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (dependencies.size() > 1 && dataset.isEmpty()) {
    throw new RuntimeException(
        "Must specify dataset on which to conduct data quality tests when more than one dependency");
  }
  Dataset<Row> theDataset;
  Dataset<Row> theResults = null;
  if (dependencies.size() == 1) {
    theDataset = dependencies.values().iterator().next();
  } else {
    theDataset = dependencies.get(dataset);
  }
  if (scope == Scope.DATASET) {
    // The checks are run at a dataset level and we are simply returning a DS of <name, boolean> Rows
    for (DatasetRule rule : datasetRules.values()) {
      if (theResults == null) {
        theResults = rule.check(theDataset, dependencies);
      } else {
        theResults = theResults.union(rule.check(theDataset, dependencies));
      }
    }
  } else {
    if (theDataset.schema().getFieldIndex(resultsField).isDefined()) {
      throw new RuntimeException("The field [" + resultsField + "] already exists in the dataset schema. Use the "
          + RESULTS_FIELD_CONFIG + " configuration parameter to customize the data quality check field name");
    }
    List<StructField> checkField = Lists.newArrayList(
        new StructField(resultsField,
            DataTypes.createMapType(DataTypes.StringType, DataTypes.BooleanType), false, Metadata.empty()));
    theResults = theDataset.map(new CheckRowRules(rowRules, resultsField),
        RowEncoder.apply(SchemaUtils.appendFields(theDataset.schema(), checkField)));
  }

  return theResults;
}
Example 9
Source File: ValueSets.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<ValueSet> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  Dataset<ValueSet> withoutConcepts = valueSets.map((MapFunction<ValueSet,ValueSet>) valueSet -> {

    ValueSet valueSetWithoutConcepts = valueSet.copy();

    List<ConceptSetComponent> updatedInclusions = new ArrayList<>();

    for (ConceptSetComponent inclusion: valueSet.getCompose().getInclude()) {

      ConceptSetComponent inclusionWithoutConcepts = inclusion.copy();

      inclusionWithoutConcepts.setConcept(new ArrayList<>());
      updatedInclusions.add(inclusionWithoutConcepts);
    }

    valueSetWithoutConcepts.getCompose().setInclude(updatedInclusions);

    return valueSetWithoutConcepts;
  }, VALUE_SET_ENCODER);

  Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator, getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
Example 10
Source File: ConceptMaps.java From bunsen with Apache License 2.0 | 5 votes |
@Override
public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

  if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
  }

  // Remove the concept contents for persistence. This is most easily done in the ConceptMap
  // object by setting the group to an empty list.
  Dataset<ConceptMap> withoutConcepts = conceptMaps
      .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> {

        // Remove the elements rather than the groups to preserve the
        // "unmapped" structure in a group that can refer to other
        // concept maps.
        ConceptMap withoutElements = conceptMap.copy();

        List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>();

        for (ConceptMapGroupComponent group: withoutElements.getGroup()) {

          group.setElement(new ArrayList<>());
          updatedGroups.add(group);
        }

        withoutElements.setGroup(updatedGroups);

        return withoutElements;
      }, CONCEPT_MAP_ENCODER);

  Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator, MAPPING_ENCODER);

  return withConceptMaps(withoutConcepts, newMappings);
}
Example 11
Source File: ReadSourceTranslatorBatch.java From beam with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Override public void translateTransform( PTransform<PBegin, PCollection<T>> transform, TranslationContext context) { AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> rootTransform = (AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>>) context.getCurrentTransform(); BoundedSource<T> source; try { source = ReadTranslation.boundedSourceFromTransform(rootTransform); } catch (IOException e) { throw new RuntimeException(e); } SparkSession sparkSession = context.getSparkSession(); String serializedSource = Base64Serializer.serializeUnchecked(source); Dataset<Row> rowDataset = sparkSession .read() .format(sourceProviderClass) .option(DatasetSourceBatch.BEAM_SOURCE_OPTION, serializedSource) .option( DatasetSourceBatch.DEFAULT_PARALLELISM, String.valueOf(context.getSparkSession().sparkContext().defaultParallelism())) .option( DatasetSourceBatch.PIPELINE_OPTIONS, context.getSerializableOptions().toString()) .load(); // extract windowedValue from Row WindowedValue.FullWindowedValueCoder<T> windowedValueCoder = WindowedValue.FullWindowedValueCoder.of( source.getOutputCoder(), GlobalWindow.Coder.INSTANCE); Dataset<WindowedValue<T>> dataset = rowDataset.map( RowHelpers.extractWindowedValueFromRowMapFunction(windowedValueCoder), EncoderHelpers.fromBeamCoder(windowedValueCoder)); PCollection<T> output = (PCollection<T>) context.getOutput(); context.putDataset(output, dataset); }
Example 12
Source File: PiComputeLambdaApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
/**
 * The processing code.
 */
private void start(int slices) {
  int numberOfThrows = 100000 * slices;
  System.out.println("About to throw " + numberOfThrows
      + " darts, ready? Stay away from the target!");

  long t0 = System.currentTimeMillis();
  SparkSession spark = SparkSession
      .builder()
      .appName("Spark Pi with lambdas")
      .master("local[*]")
      .getOrCreate();

  long t1 = System.currentTimeMillis();
  System.out.println("Session initialized in " + (t1 - t0) + " ms");

  List<Integer> l = new ArrayList<>(numberOfThrows);
  for (int i = 0; i < numberOfThrows; i++) {
    l.add(i);
  }
  Dataset<Row> incrementalDf = spark
      .createDataset(l, Encoders.INT())
      .toDF();

  long t2 = System.currentTimeMillis();
  System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");

  Dataset<Integer> dotsDs = incrementalDf
      .map((MapFunction<Row, Integer>) status -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        counter++;
        if (counter % 100000 == 0) {
          System.out.println("" + counter + " darts thrown so far");
        }
        return (x * x + y * y <= 1) ? 1 : 0;
      }, Encoders.INT());

  long t3 = System.currentTimeMillis();
  System.out.println("Throwing darts done in " + (t3 - t2) + " ms");

  int dartsInCircle = dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);

  long t4 = System.currentTimeMillis();
  System.out.println("Analyzing result in " + (t4 - t3) + " ms");

  System.out.println(
      "Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);

  spark.stop();
}
Example 13
Source File: ParDoTranslatorBatch.java From beam with Apache License 2.0 | 4 votes |
@Override
public void translateTransform(
    PTransform<PCollection<InputT>, PCollectionTuple> transform, TranslationContext context) {
  String stepName = context.getCurrentTransform().getFullName();

  // Check for not supported advanced features
  // TODO: add support of Splittable DoFn
  DoFn<InputT, OutputT> doFn = getDoFn(context);
  checkState(
      !DoFnSignatures.isSplittable(doFn),
      "Not expected to directly translate splittable DoFn, should have been overridden: %s",
      doFn);

  // TODO: add support of states and timers
  checkState(
      !DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");

  checkState(
      !DoFnSignatures.requiresTimeSortedInput(doFn),
      "@RequiresTimeSortedInput is not " + "supported for the moment");

  DoFnSchemaInformation doFnSchemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());

  // Init main variables
  PValue input = context.getInput();
  Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
  Map<TupleTag<?>, PValue> outputs = context.getOutputs();
  TupleTag<?> mainOutputTag = getTupleTag(context);
  List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
  WindowingStrategy<?, ?> windowingStrategy =
      ((PCollection<InputT>) input).getWindowingStrategy();
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();

  // construct a map from side input to WindowingStrategy so that
  // the DoFn runner can map main-input windows to side input windows
  List<PCollectionView<?>> sideInputs = getSideInputs(context);
  Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
  for (PCollectionView<?> sideInput : sideInputs) {
    sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
  }

  SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);

  Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
  MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();

  List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (TupleTag<?> tag : outputTags) {
    if (!tag.equals(mainOutputTag)) {
      additionalOutputTags.add(tag);
    }
  }

  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
  @SuppressWarnings("unchecked")
  DoFnFunction<InputT, OutputT> doFnWrapper =
      new DoFnFunction(
          metricsAccum,
          stepName,
          doFn,
          windowingStrategy,
          sideInputStrategies,
          context.getSerializableOptions(),
          additionalOutputTags,
          mainOutputTag,
          inputCoder,
          outputCoderMap,
          broadcastStateData,
          doFnSchemaInformation,
          sideInputMapping);

  MultiOuputCoder multipleOutputCoder =
      MultiOuputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
  Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs =
      inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
  if (outputs.entrySet().size() > 1) {
    allOutputs.persist();
    for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
      pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
    }
  } else {
    Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        allOutputs.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>)
                value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
  }
}
Example 14
Source File: PiComputeLambdaWithClassApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
/**
 * The processing code.
 */
private void start(int slices) {
  int numberOfThrows = 100000 * slices;
  System.out.println("About to throw " + numberOfThrows
      + " darts, ready? Stay away from the target!");

  long t0 = System.currentTimeMillis();
  SparkSession spark = SparkSession
      .builder()
      .appName("Spark Pi with lambdas")
      .master("local[*]")
      .getOrCreate();

  long t1 = System.currentTimeMillis();
  System.out.println("Session initialized in " + (t1 - t0) + " ms");

  List<Integer> l = new ArrayList<>(numberOfThrows);
  for (int i = 0; i < numberOfThrows; i++) {
    l.add(i);
  }
  Dataset<Row> incrementalDf = spark
      .createDataset(l, Encoders.INT())
      .toDF();

  long t2 = System.currentTimeMillis();
  System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");

  Dataset<Integer> dotsDs = incrementalDf
      .map(new DartMapper(), Encoders.INT());

  long t3 = System.currentTimeMillis();
  System.out.println("Throwing darts done in " + (t3 - t2) + " ms");

  int dartsInCircle = dotsDs.reduce(new DartReducer());

  long t4 = System.currentTimeMillis();
  System.out.println("Analyzing result in " + (t4 - t3) + " ms");

  System.out
      .println("Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);

  spark.stop();
}
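The DartMapper and DartReducer classes referenced above are not part of the snippet. Hypothetical implementations that mirror the lambda version in Example 12 (minus the throw counter) would be:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Row;

class DartMapper implements MapFunction<Row, Integer> {
  @Override
  public Integer call(Row row) throws Exception {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    return (x * x + y * y <= 1) ? 1 : 0; // 1 if the dart lands inside the unit circle
  }
}

class DartReducer implements ReduceFunction<Integer> {
  @Override
  public Integer call(Integer a, Integer b) throws Exception {
    return a + b;
  }
}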
Example 15
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License | 4 votes |
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
      .textFile(Constant.LOCAL_FILE_PREX + "/data/resources/people.txt", 1)
      .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
Example 16
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License | 4 votes |
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
      Collections.singletonList(person),
      personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = Constant.LOCAL_FILE_PREX + "/data/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
Example 17
Source File: JavaSparkHiveExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrames data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
Example 18
Source File: ReadSourceTranslatorStreaming.java From beam with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Override public void translateTransform( PTransform<PBegin, PCollection<T>> transform, TranslationContext context) { AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> rootTransform = (AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>>) context.getCurrentTransform(); UnboundedSource<T, UnboundedSource.CheckpointMark> source; try { source = ReadTranslation.unboundedSourceFromTransform(rootTransform); } catch (IOException e) { throw new RuntimeException(e); } SparkSession sparkSession = context.getSparkSession(); String serializedSource = Base64Serializer.serializeUnchecked(source); Dataset<Row> rowDataset = sparkSession .readStream() .format(sourceProviderClass) .option(DatasetSourceStreaming.BEAM_SOURCE_OPTION, serializedSource) .option( DatasetSourceStreaming.DEFAULT_PARALLELISM, String.valueOf(context.getSparkSession().sparkContext().defaultParallelism())) .option( DatasetSourceStreaming.PIPELINE_OPTIONS, context.getSerializableOptions().toString()) .load(); // extract windowedValue from Row WindowedValue.FullWindowedValueCoder<T> windowedValueCoder = WindowedValue.FullWindowedValueCoder.of( source.getOutputCoder(), GlobalWindow.Coder.INSTANCE); Dataset<WindowedValue<T>> dataset = rowDataset.map( RowHelpers.extractWindowedValueFromRowMapFunction(windowedValueCoder), EncoderHelpers.fromBeamCoder(windowedValueCoder)); PCollection<T> output = (PCollection<T>) context.getOutput(); context.putDataset(output, dataset); }
Example 19
Source File: StructuredKafkaSource08.java From sylph with Apache License 2.0 | 4 votes |
public Dataset<Row> createSource(SparkSession spark, KafkaSourceConfig08 config, SourceContext context) {
  String topics = requireNonNull(config.getTopics(), "topics not setting");
  String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the broker hosts must be resolvable from the machine running this program
  String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
  String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

  Map<String, String> otherConfig = config.getOtherConfig().entrySet()
      .stream()
      .filter(x -> x.getValue() != null)
      .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

  Map<String, String> kafkaParams = new HashMap<>(otherConfig);
  kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
  //kafkaParams.put("auto.commit.enable", true); // do not auto-commit offsets
  // "fetch.message.max.bytes" ->
  // "session.timeout.ms" -> "30000",    // session timeout defaults to 30 seconds
  // "heartbeat.interval.ms" -> "5000",  // heartbeat interval
  kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); // note: each stream must use a distinct group.id, otherwise offset commits will fail
  kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); // largest / smallest

  Dataset<Row> kafka08 = spark.readStream()
      .format(KafkaDataSource08.class.getName())
      .option("topics", topics)
      .options(kafkaParams)
      .load();

  if ("json".equalsIgnoreCase(config.getValueType())) {
    JsonSchema jsonParser = new JsonSchema(context.getSchema());
    return kafka08
        .map((MapFunction<Row, Row>) record -> {
          return jsonParser.deserialize(
              record.getAs("_key"),
              record.getAs("_message"),
              record.<String>getAs("_topic"),
              record.<Integer>getAs("_partition"),
              record.<Long>getAs("_offset"));
        }, RowEncoder.apply(jsonParser.getProducedType()));
  }
  else {
    StructType structType = schemaToSparkType(context.getSchema());
    String[] columns = Arrays.stream(structType.names()).map(name -> {
      switch (name) {
        case "_key":
          return "CAST(_key AS STRING) as _key";
        case "_message":
          return "CAST(_message AS STRING) as _message";
        default:
          return name;
      }
    }).toArray(String[]::new);
    return kafka08.selectExpr(columns); // cast the raw input columns
  }
}
Example 20
Source File: Functions.java From bunsen with Apache License 2.0 | 2 votes |
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceTypeUrl the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<Row> dataset, String resourceTypeUrl) {

  return dataset.map(new ToJson(resourceTypeUrl), Encoders.STRING());
}
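A possible call site, sketched here for illustration only: the dataset name is hypothetical and the resource type URL shown is simply the standard FHIR Observation profile, not a value taken from the Bunsen source.

// Hypothetical usage: 'observations' is a Dataset<Row> of encoded FHIR Observation resources
Dataset<String> observationJson =
    Functions.toJson(observations, "http://hl7.org/fhir/StructureDefinition/Observation");
observationJson.show(5, false);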