org.apache.spark.sql.Encoders Java Examples
The following examples show how to use
org.apache.spark.sql.Encoders.
Each example notes its source file, originating project, and license.
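Before the individual examples, here is a minimal, self-contained sketch of the factory methods that appear most often below: Encoders.STRING(), Encoders.INT(), and Encoders.bean(). The class and variable names in the sketch (EncodersSketch, Person) are illustrative only and are not taken from any of the listed projects.

// A minimal sketch of common Encoders factory methods, assuming a local SparkSession.
// The Person bean and all names here are hypothetical, for illustration only.
import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class EncodersSketch {

  // Simple Java bean used with Encoders.bean(); needs getters/setters and a no-arg constructor.
  public static class Person implements Serializable {
    private String name;
    private int age;
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("EncodersSketch")
        .getOrCreate();

    // Encoders for primitive wrapper types
    Dataset<String> strings = spark.createDataset(Arrays.asList("a", "b"), Encoders.STRING());
    Dataset<Integer> ints = spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT());

    // Encoder derived from a Java bean's getters and setters
    Person p = new Person();
    p.setName("spark");
    p.setAge(10);
    Dataset<Person> people = spark.createDataset(Arrays.asList(p), Encoders.bean(Person.class));

    strings.show();
    ints.show();
    people.show();

    spark.stop();
  }
}

Encoders.kryo() and Encoders.javaSerialization(), used in some of the examples below, follow the same pattern but serialize whole objects into a single binary column rather than mapping bean fields to columns.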
Example #1
Source File: BundlesTest.java From bunsen with Apache License 2.0
@Test
public void testJsonBundleStrings() {

  JavaRDD<String> jsonBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/json/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> jsonBundles = spark.createDataset(jsonBundlesRdd.rdd(), Encoders.STRING());

  jsonBundles.write().saveAsTable("json_bundle_table");

  JavaRDD<BundleContainer> bundlesRdd = bundles.fromJson(
      spark.sql("select value from json_bundle_table"), "value");

  Dataset<Patient> patients = BundlesTest.bundles.extractEntry(spark,
      bundlesRdd,
      Patient.class);

  checkPatients(patients);
}
Example #2
Source File: BundlesTest.java From bunsen with Apache License 2.0
@Test
public void testJsonBundleStrings() {

  JavaRDD<String> jsonBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/json/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> jsonBundles = spark.createDataset(jsonBundlesRdd.rdd(), Encoders.STRING());

  jsonBundles.write().saveAsTable("json_bundle_table");

  JavaRDD<BundleContainer> bundlesRdd = bundles.fromJson(
      spark.sql("select value from json_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundlesRdd,
      Patient.class);

  checkPatients(patients);
}
Example #3
Source File: TextEncodedTelemetryReaderTest.java From metron with Apache License 2.0
@Test
public void testCSV() {
  // re-write the test data as a CSV with a header record
  String pathToCSV = tempFolder.getRoot().getAbsolutePath();
  spark.read()
      .format("text")
      .load("src/test/resources/telemetry.json")
      .as(Encoders.STRING())
      .write()
      .mode("overwrite")
      .option("header", "true")
      .format("csv")
      .save(pathToCSV);

  // tell the profiler to use the CSV input data
  profilerProperties.put(TELEMETRY_INPUT_PATH.getKey(), pathToCSV);
  profilerProperties.put(TELEMETRY_INPUT_FORMAT.getKey(), "csv");

  // set a reader property; tell the reader to expect a header
  readerProperties.put("header", "true");

  // there should be 100 valid JSON records
  Dataset<String> telemetry = TelemetryReaders.TEXT.read(spark, profilerProperties, readerProperties);
  assertEquals(100, telemetry.filter(new IsValidJSON()).count());
}
Example #4
Source File: BundlesTest.java From bunsen with Apache License 2.0
@Test
public void testXmlBundleStrings() {

  JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/xml/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(), Encoders.STRING());

  xmlBundles.write().saveAsTable("xml_bundle_table");

  JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
      spark.sql("select value from xml_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundles,
      Patient.class);

  checkPatients(patients);
}
Example #5
Source File: MutationToStructureDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
  SparkSession spark = SparkSession.builder()
      .master("local[*]")
      .appName(MutationToStructureDemo.class.getSimpleName())
      .getOrCreate();

  // find missense mutations that map to UniProt ID P15056 (BRAF)
  // that are annotated as pathogenic or likely pathogenic in ClinVar.
  List<String> uniprotIds = Arrays.asList("P15056"); // BRAF: P15056
  String query = "clinvar.rcv.clinical_significance:pathogenic OR clinvar.rcv.clinical_significance:likely pathogenic";
  Dataset<Row> df = MyVariantDataset.getVariations(uniprotIds, query).cache();
  System.out.println("BRAF missense mutations: " + df.count());
  df.show();

  // extract the list of variant Ids
  List<String> variantIds = df.select("variationId").as(Encoders.STRING()).collectAsList();

  // map to PDB structures
  Dataset<Row> ds = G2SDataset.getPositionDataset(variantIds);
  ds = ds.sort("structureId", "chainId", "pdbPosition");
  ds.show();

  spark.close();
}
Example #6
Source File: TestSQLDeriver.java From envelope with Apache License 2.0
@Test
public void testQueryLiteral() throws Exception {
  Contexts.getSparkSession().createDataset(Lists.newArrayList(1), Encoders.INT())
      .createOrReplaceTempView("literaltable");

  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(SQLDeriver.QUERY_LITERAL_CONFIG_NAME, "SELECT * FROM literaltable");
  Config config = ConfigFactory.parseMap(configMap);

  SQLDeriver deriver = new SQLDeriver();
  assertNoValidationFailures(deriver, config);
  deriver.configure(config);

  Object result = deriver.derive(Maps.<String, Dataset<Row>>newHashMap())
      .collectAsList().get(0).get(0);

  assertEquals(1, result);
}
Example #7
Source File: CsvToDatasetBookAsJson.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset<Book> as JSON")
      .master("local")
      .getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
Example #8
Source File: PdbToUniProt.java From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * chain-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainIds (e.g., 1XYZ.A).
 *
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt chain-level mappings
 * @throws IOException
 */
public static Dataset<Row> getChainMappings(List<String> ids) throws IOException {

  SparkSession spark = SparkSession.builder().getOrCreate();

  // get a dataset of up-to-date UniProt chain mappings
  Dataset<Row> ds = getChainMappings();

  // create a dataset of ids from the passed-in list
  Dataset<Row> subset = spark.createDataset(ids, Encoders.STRING()).toDF("id");

  // create subsets of data
  if (!ids.isEmpty()) {
    if (ids.get(0).length() == 4) {
      // join by pdbId
      ds = ds.join(subset, ds.col("structureId").equalTo(subset.col("id"))).drop("id");
    } else {
      // join by pdbChainId
      ds = ds.join(subset, ds.col("structureChainId").equalTo(subset.col("id"))).drop("id");
    }
  }

  return ds;
}
Example #9
Source File: StructuredNodeLoader.java From sylph with Apache License 2.0
private static TransForm<Dataset<Row>> loadRealTimeTransForm(RealTimeTransForm realTimeTransForm) {
  return stream -> {
    // In Spark 2.x, a map operation on a Dataset must declare an encoder, i.e. the schema of the
    // returned rows must be specified, as in the commented-out Scala below:
    //   implicit val matchError: org.apache.spark.sql.Encoder[Row] = org.apache.spark.sql.Encoders.kryo[Row]
    //   import collection.JavaConverters._
    //   val mapRowSchema = realTimeTransForm.getRowSchema.getFields.asScala.map(filed => {
    //     StructField(filed.getName, SparkRow.SparkRowParser.parserType(filed.getJavaType), true)
    //   })
    //   RowEncoder.apply(StructType(mapRowSchema))
    //   implicit val mapenc = RowEncoder.apply(rddSchema)
    // RowEncoder cannot be used here because it requires SQL primitive types
    //Encoders.STRING
    Dataset<Row> transStream = stream.mapPartitions(
        (MapPartitionsFunction<Row, Row>) partition ->
            StreamNodeLoader.transFunction(partition, realTimeTransForm),
        Encoders.kryo(Row.class));
    // alternatively, use transStream.as()
    return transStream;
  };
}
Example #10
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0
@Test
public synchronized void testTablesSupport() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
  createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "1"),
      new SimpleRecord(2, "2"),
      new SimpleRecord(3, "3"));

  Dataset<Row> inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(loadLocation(tableIdentifier));

  Dataset<Row> resultDf = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier));
  List<SimpleRecord> actualRecords = resultDf.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Records should match", expectedRecords, actualRecords);
}
Example #11
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
Example #12
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
Example #13
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0
@Override
public List<String> execute() {
  Dataset<Row> validDataFileDF = buildValidDataFileDF();
  Dataset<Row> validMetadataFileDF = buildValidMetadataFileDF();
  Dataset<Row> validFileDF = validDataFileDF.union(validMetadataFileDF);
  Dataset<Row> actualFileDF = buildActualFileDF();

  Column nameEqual = filename.apply(actualFileDF.col("file_path"))
      .equalTo(filename.apply(validFileDF.col("file_path")));
  Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path"));
  Column joinCond = nameEqual.and(actualContains);
  List<String> orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti")
      .as(Encoders.STRING())
      .collectAsList();

  Tasks.foreach(orphanFiles)
      .noRetry()
      .suppressFailureWhenFinished()
      .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc))
      .run(deleteFunc::accept);

  return orphanFiles;
}
Example #14
Source File: BatchProfilerIntegrationTest.java From metron with Apache License 2.0
@Test
public void testBatchProfilerWithCSV() throws Exception {
  // re-write the test data as a CSV with a header record
  String pathToCSV = tempFolder.getRoot().getAbsolutePath();
  spark.read()
      .format("text")
      .load("src/test/resources/telemetry.json")
      .as(Encoders.STRING())
      .write()
      .mode("overwrite")
      .option("header", "true")
      .format("csv")
      .save(pathToCSV);

  // tell the profiler to use the CSV input data
  // CSV is an example of needing to define both the reader and the input format
  profilerProperties.put(TELEMETRY_INPUT_PATH.getKey(), pathToCSV);
  profilerProperties.put(TELEMETRY_INPUT_READER.getKey(), "text");
  profilerProperties.put(TELEMETRY_INPUT_FORMAT.getKey(), "csv");

  // set a reader property; tell the reader to expect a header
  readerProperties.put("header", "true");

  BatchProfiler profiler = new BatchProfiler();
  profiler.run(spark, profilerProperties, getGlobals(), readerProperties, fromJSON(profileJson));

  validateProfiles();
}
Example #15
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example #16
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved
  // The result of loading a parquet file is also a DataFrame
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
Example #17
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedStartsWith() {
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  List<String> matchedData = df.select("data")
      .where("data LIKE 'jun%'")
      .as(Encoders.STRING())
      .collectAsList();

  Assert.assertEquals(1, matchedData.size());
  Assert.assertEquals("junction", matchedData.get(0));
}
Example #18
Source File: AbstractJavaEsSparkStructuredStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void test1WriteWithMappingId() throws Exception {
  String target = wrapIndex(resource("test-write-id", "data"));
  String docPath = wrapIndex(docPath("test-write-id", "data"));
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .option("es.mapping.id", "id")
          .format("es"),
      target
  );

  assertEquals(3, JavaEsSpark.esRDD(new JavaSparkContext(spark.sparkContext()), target).count());
  assertTrue(RestUtils.exists(docPath + "/1"));
  assertTrue(RestUtils.exists(docPath + "/2"));
  assertTrue(RestUtils.exists(docPath + "/3"));

  assertThat(RestUtils.get(target + "/_search?"), containsString("Spark"));
}
Example #19
Source File: TextEncodedTelemetryReader.java From metron with Apache License 2.0
@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String inputPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if (inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", inputPath, inputFormat);

  return spark
      .read()
      .options(Maps.fromProperties(readerProps))
      .format(inputFormat)
      .load(inputPath)
      .as(Encoders.STRING());
}
Example #20
Source File: AbstractJavaEsSparkStructuredStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void test1BasicWrite() throws Exception {
  String target = wrapIndex(resource("test-write", "data"));
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .format("es"),
      target
  );

  assertTrue(RestUtils.exists(target));
  assertThat(RestUtils.get(target + "/_search?"), containsString("Spark"));
  assertThat(RestUtils.get(target + "/_search?"), containsString("Hadoop"));
  assertThat(RestUtils.get(target + "/_search?"), containsString("YARN"));
}
Example #21
Source File: TestOrcWrite.java From iceberg with Apache License 2.0
@Test
public void testBasicWrite() throws IOException {
  File parent = temp.newFolder("orc");
  File location = new File(parent, "test");
  location.mkdirs();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
  Table table = tables.create(SCHEMA, spec, location.toString());
  table.updateProperties()
      .defaultFormat(FileFormat.ORC)
      .set(OrcConf.COMPRESS.getAttribute(), CompressionKind.NONE.name())
      .commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);
  // TODO: incoming columns must be ordered according to the table's schema
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(
      Encoders.bean(SimpleRecord.class)).collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example #22
Source File: TestPartitionValues.java From iceberg with Apache License 2.0
@Test
public void testReorderedColumns() throws Exception {
  String desc = "reorder_columns";
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("data", "id").write()
      .format("iceberg")
      .mode("append")
      .option("check-ordering", "false")
      .save(location.toString());

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result
      .orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example #23
Source File: DataSetApplication.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
  SparkSession sparkSession = SparkSession.builder().master("local")
      .appName("Java Spark SQL")
      .getOrCreate();

  Person person = new Person("spark", 10);
  Encoder<Person> encoder = Encoders.bean(Person.class);
  Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
  dataset.show();
  // output: {name:spark;age:10}

  /* encoders for common types */
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
  Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) {
      return value + 1;
    }
  }, integerEncoder);
  result.collect();
  // output: [2, 3]

  /* by providing a class, a DataFrame can be converted to a Dataset; fields are mapped by name */
  String url = "/usr/local/text.json";
  Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
  personDataset.show();
  // output: name: ... age: ...
}
Example #24
Source File: TestPartitionValues.java From iceberg with Apache License 2.0
@Test
public void testNullPartitionValue() throws Exception {
  String desc = "null_part";
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c"),
      new SimpleRecord(4, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(location.toString());

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result
      .orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example #25
Source File: AbstractJavaEsSparkStructuredStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test(expected = EsHadoopIllegalArgumentException.class)
public void test0FailOnIndexCreationDisabled() throws Exception {
  String target = wrapIndex(resource("test-nonexisting", "data"));
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .expectingToThrow(EsHadoopIllegalArgumentException.class)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .option(ES_INDEX_AUTO_CREATE, "no")
          .format("es"),
      target
  );

  assertTrue(!RestUtils.exists(target));
}
Example #26
Source File: TestDataSourceOptions.java From iceberg with Apache License 2.0
@Test
public void testHadoopOptions() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  Configuration sparkHadoopConf = spark.sessionState().newHadoopConf();
  String originalDefaultFS = sparkHadoopConf.get("fs.default.name");

  try {
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    tables.create(SCHEMA, spec, options, tableLocation);

    // set an invalid value for 'fs.default.name' in Spark Hadoop config
    // to verify that 'hadoop.' data source options are propagated correctly
    sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000");

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a"),
        new SimpleRecord(2, "b")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .option("hadoop.fs.default.name", "file:///")
        .save(tableLocation);

    Dataset<Row> resultDf = spark.read()
        .format("iceberg")
        .option("hadoop.fs.default.name", "file:///")
        .load(tableLocation);
    List<SimpleRecord> resultRecords = resultDf.orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Records should match", expectedRecords, resultRecords);
  } finally {
    sparkHadoopConf.set("fs.default.name", originalDefaultFS);
  }
}
Example #27
Source File: Functions.java From bunsen with Apache License 2.0
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {

  Dataset<IBaseResource> resourceDataset =
      dataset.as(FhirEncoders.forR4()
          .getOrCreate()
          .of(resourceType));

  return resourceDataset.map(new ToJson(), Encoders.STRING());
}
Example #28
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0
private Dataset<Row> buildValidMetadataFileDF() {
  String allManifestsMetadataTable = metadataTableName(MetadataTableType.ALL_MANIFESTS);
  Dataset<Row> manifestDF = spark.read().format("iceberg")
      .load(allManifestsMetadataTable)
      .selectExpr("path as file_path");

  List<String> otherMetadataFiles = Lists.newArrayList();

  for (Snapshot snapshot : table.snapshots()) {
    String manifestListLocation = snapshot.manifestListLocation();
    if (manifestListLocation != null) {
      otherMetadataFiles.add(manifestListLocation);
    }
  }

  otherMetadataFiles.add(ops.metadataFileLocation("version-hint.text"));

  TableMetadata metadata = ops.current();
  otherMetadataFiles.add(metadata.metadataFileLocation());
  for (TableMetadata.MetadataLogEntry previousMetadataFile : metadata.previousFiles()) {
    otherMetadataFiles.add(previousMetadataFile.file());
  }

  Dataset<Row> otherMetadataFileDF = spark
      .createDataset(otherMetadataFiles, Encoders.STRING())
      .toDF("file_path");

  return manifestDF.union(otherMetadataFileDF);
}
Example #29
Source File: JavaUserDefinedTypedAggregation.java From nemo with Apache License 2.0
/**
 * Main function.
 * @param args arguments.
 */
public static void main(final String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark SQL user-defined Datasets aggregation example")
      .getOrCreate();

  Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
  String path = args[0];
  Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
  ds.show();
  // +-------+------+
  // |   name|salary|
  // +-------+------+
  // |Michael|  3000|
  // |   Andy|  4500|
  // | Justin|  3500|
  // |  Berta|  4000|
  // +-------+------+

  MyAverage myAverage = new MyAverage();
  // Convert the function to a `TypedColumn` and give it a name
  TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
  Dataset<Double> result = ds.select(averageSalary);
  result.show();
  // +--------------+
  // |average_salary|
  // +--------------+
  // |        3750.0|
  // +--------------+

  spark.stop();
}
Example #30
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0
@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}