org.apache.spark.sql.SQLContext Java Examples
The following examples show how to use
org.apache.spark.sql.SQLContext.
The examples are taken from open source projects; the source file, originating project, and license are noted above each example.
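For orientation, here is a minimal sketch of the workflow the examples below build on: create an SQLContext from a JavaSparkContext, load a DataFrame, and run a SQL query against it. This assumes Spark 1.x (where SQLContext is the entry point; in Spark 2.x and later, SparkSession supersedes it), a local master, and a hypothetical people.json input file.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class SQLContextSketch {
    public static void main(String[] args) {
        // Local Spark context; adjust the master and app name for your environment.
        SparkConf conf = new SparkConf().setAppName("SQLContextSketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // SQLContext is the Spark 1.x entry point for DataFrame and SQL functionality.
        SQLContext sqlContext = new SQLContext(jsc);

        // Hypothetical input path; any JSON file with "name" and "age" columns would do.
        DataFrame people = sqlContext.read().json("data/people.json");

        // Register the DataFrame as a temporary table and query it with SQL.
        people.registerTempTable("people");
        DataFrame adults = sqlContext.sql("SELECT name FROM people WHERE age >= 18");

        for (Row row : adults.collectAsList()) {
            System.out.println(row.getString(0));
        }

        jsc.stop();
    }
}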
Example #1
Source File: HoodieSnapshotExporter.java From hudi with Apache License 2.0 | 7 votes |
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg,
                             List<String> partitions, String latestCommitTimestamp) {
    Partitioner defaultPartitioner = dataset -> {
        Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
        return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
            ? hoodieDroppedDataset.write()
            : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
    };

    Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
        ? defaultPartitioner
        : ReflectionUtils.loadClass(cfg.outputPartitioner);

    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    Iterator<String> exportingFilePaths = jsc
        .parallelize(partitions, partitions.size())
        .flatMap(partition -> fsView
            .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
            .map(HoodieBaseFile::getPath).iterator())
        .toLocalIterator();

    Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
    partitioner.partition(sourceDataset)
        .format(cfg.outputFormat)
        .mode(SaveMode.Overwrite)
        .save(cfg.targetOutputPath);
}
Example #2
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
public void createHDFSRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRdd = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRdd);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

    RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
    verify(javaRdd, times(1)).map(any(Function.class));
}
Example #3
Source File: NGramBuilder.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Creates a n-gram data frame from text lines.
 * @param lines
 * @return a n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
    });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
Example #4
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
public void textFileS3Test() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);
    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createS3RDD(config);

    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createS3RDD(config);
}
Example #5
Source File: SubStringCounterDataSource.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
@Override
public BaseRelation createRelation(SQLContext arg0, Map<String, String> arg1) {
    log.debug("-> createRelation()");

    java.util.Map<String, String> javaMap = scala.collection.JavaConverters
        .mapAsJavaMapConverter(arg1).asJava();

    SubStringCounterRelation br = new SubStringCounterRelation();
    br.setSqlContext(arg0);

    for (java.util.Map.Entry<String, String> entry : javaMap.entrySet()) {
        String key = entry.getKey();
        String value = entry.getValue();
        log.debug("[{}] --> [{}]", key, value);
        if (key.compareTo(K.PATH) == 0) {
            br.setFilename(value);
        } else if (key.startsWith(K.COUNT)) {
            br.addCriteria(value);
        }
    }

    return br;
}
Example #6
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Tags a list of sequences and returns a list of tag sequences.
 * @param sentences
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
    List<Row> rows = new LinkedList<Row>();
    for (String sentence : sentences) {
        rows.add(RowFactory.create(sentence));
    }
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(rows, schema);
    if (cmmModel != null) {
        DataFrame output = cmmModel.transform(input).repartition(1);
        return output.javaRDD().map(new RowToStringFunction(1)).collect();
    } else {
        System.err.println("Tagging model is null. You need to create or load a model first.");
        return null;
    }
}
Example #7
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0 | 6 votes |
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
                                      String instantTime) {
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
    if (!commitTimeline.containsInstant(commitInstant)) {
        throw new HoodieException("No commit exists at " + instantTime);
    }
    try {
        HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
        LOG.info("Path :" + paths.values());
        return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
            .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
    } catch (Exception e) {
        throw new HoodieException("Error reading commit " + instantTime, e);
    }
}
Example #8
Source File: DataStreamLoaderExample.java From toolbox with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("SLink!").setMaster("local");
    SparkContext sc = new SparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Path to dataset
    String path = "datasets/simulated/WI_samples.json";

    // Create an AMIDST object for managing the data
    DataSpark dataSpark = DataSparkLoader.open(sqlContext, path);

    // Print all the instances in the dataset
    dataSpark.collectDataStream()
        .forEach(dataInstance -> System.out.println(dataInstance));
}
Example #9
Source File: BigQuerySparkSQL.java From spark-on-k8s-gcp-examples with Apache License 2.0 | 6 votes |
private static BigQuerySQLContext createBigQuerySQLContext(String[] args) {
    String projectId = args[0];
    Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
        "GCP project ID must not be empty");
    String gcsBucket = args[1];
    Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
        "GCS bucket must not be empty");

    String serviceAccountJsonKeyFilePath = System.getenv(APPLICATION_CREDENTIALS_ENV);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(serviceAccountJsonKeyFilePath),
        APPLICATION_CREDENTIALS_ENV + " must be set");

    SQLContext sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
    BigQuerySQLContext bigQuerySQLContext = new BigQuerySQLContext(sqlContext);
    bigQuerySQLContext.setBigQueryProjectId(projectId);
    bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
    bigQuerySQLContext.setGcpJsonKeyFile(serviceAccountJsonKeyFilePath);

    return bigQuerySQLContext;
}
Example #10
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0 | 6 votes |
/**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
                                     HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata, and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
        Dataset<Row> rows = null;
        if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
            rows = sqlContext.read().parquet(paths);
        }
        return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
    }
}
Example #11
Source File: NeedingHelpGoPackageFinder.java From spark-on-k8s-gcp-examples with Apache License 2.0 | 6 votes |
private NeedingHelpGoPackageFinder(
        String projectId,
        String bigQueryDataset,
        String gcsBucket,
        boolean useSampleTables) {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
        "GCP project ID must not be empty");
    Preconditions.checkArgument(!Strings.isNullOrEmpty(bigQueryDataset),
        "BigQuery dataset name must not be empty");
    Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
        "GCS bucket must not be empty");

    this.projectId = projectId;
    this.bigQueryDataset = bigQueryDataset;
    this.sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
    this.bigQuerySQLContext = new BigQuerySQLContext(this.sqlContext);
    this.bigQuerySQLContext.setBigQueryProjectId(projectId);
    this.bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
    this.useSampleTables = useSampleTables;
}
Example #12
Source File: MultiExpressionScript.java From HiveQLUnit with Apache License 2.0 | 6 votes |
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by spark through the TestHiveServer TestRule, used to run hql expressions
 */
@Override
public void runScript(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (String expression : expressions) {
        sqlContext.sql(expression);
    }
}
Example #13
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test(expected = UnsupportedOperationException.class)
public void createJavaSchemaFromEmptyRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    when(singleRdd.first()).thenThrow(new UnsupportedOperationException());

    deepSparkContextSpy.createJavaSchemaRDD(config);
}
Example #14
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Tags a list of sequences and writes the result to an output file with a
 * desired output format.
 *
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    List<Row> rows = new LinkedList<Row>();
    for (String sentence : sentences) {
        rows.add(RowFactory.create(sentence));
    }
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(rows, schema);
    tag(input, outputFileName, outputFormat);
}
Example #15
Source File: SparkSqlInterpreter.java From Explorer with Apache License 2.0 | 5 votes |
public int getProgress() {
    SQLContext sqlc = getSparkInterpreter().getSQLContext();
    SparkContext sc = sqlc.sparkContext();
    JobProgressListener sparkListener = getSparkInterpreter().getJobProgressListener();
    int completedTasks = 0;
    int totalTasks = 0;

    DAGScheduler scheduler = sc.dagScheduler();
    HashSet<ActiveJob> jobs = scheduler.activeJobs();
    Iterator<ActiveJob> it = jobs.iterator();
    while (it.hasNext()) {
        ActiveJob job = it.next();
        String g = (String) job.properties().get("spark.jobGroup.id");
        if (jobGroup.equals(g)) {
            int[] progressInfo = null;
            if (sc.version().startsWith("1.0")) {
                progressInfo = getProgressFromStage_1_0x(sparkListener, job.finalStage());
            } else if (sc.version().startsWith("1.1") || sc.version().startsWith("1.2")) {
                progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage());
            } else {
                logger.warn("Spark {} getting progress information not supported" + sc.version());
                continue;
            }
            totalTasks += progressInfo[0];
            completedTasks += progressInfo[1];
        }
    }

    if (totalTasks == 0) {
        return 0;
    }
    return completedTasks * 100 / totalTasks;
}
Example #16
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Tags a distributed list of sentences and writes the result to an output file with
 * a desired output format.
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(sentences, schema);
    tag(input, outputFileName, outputFormat);
}
Example #17
Source File: MetadataWriter.java From rdf2x with Apache License 2.0 | 5 votes |
/**
 * @param sc spark context to be used
 * @param persistor output persistor
 * @param rdfSchema schema storing information about classes and properties
 */
public MetadataWriter(JavaSparkContext sc, Persistor persistor, RdfSchema rdfSchema) {
    this.sql = new SQLContext(sc);
    this.persistor = persistor;
    this.rdfSchema = rdfSchema;
    this.stats = new ArrayList<>();
}
Example #18
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Creates a data frame from a list of tagged sentences.
 * @param taggedSentences
 * @return a data frame of two columns: "sentence" and "partOfSpeech".
 */
public DataFrame createDataFrame(List<String> taggedSentences) {
    List<String> wordSequences = new LinkedList<String>();
    List<String> tagSequences = new LinkedList<String>();
    for (String taggedSentence : taggedSentences) {
        StringBuilder wordBuf = new StringBuilder();
        StringBuilder tagBuf = new StringBuilder();
        String[] tokens = taggedSentence.split("\\s+");
        for (String token : tokens) {
            String[] parts = token.split("/");
            if (parts.length == 2) {
                wordBuf.append(parts[0]);
                wordBuf.append(' ');
                tagBuf.append(parts[1]);
                tagBuf.append(' ');
            } else { // this token is "///"
                wordBuf.append('/');
                wordBuf.append(' ');
                tagBuf.append('/');
                tagBuf.append(' ');
            }
        }
        wordSequences.add(wordBuf.toString().trim());
        tagSequences.add(tagBuf.toString().trim());
    }
    if (verbose) {
        System.out.println("Number of sentences = " + wordSequences.size());
    }
    List<Row> rows = new LinkedList<Row>();
    for (int i = 0; i < wordSequences.size(); i++) {
        rows.add(RowFactory.create(wordSequences.get(i), tagSequences.get(i)));
    }
    JavaRDD<Row> jrdd = jsc.parallelize(rows);
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
        new StructField("partOfSpeech", DataTypes.StringType, false, Metadata.empty())
    });
    return new SQLContext(jsc).createDataFrame(jrdd, schema);
}
Example #19
Source File: MultiExpressionScript.java From HiveQLUnit with Apache License 2.0 | 5 votes |
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by spark through the TestHiveServer TestRule, used to run hql expressions
 * @return the row results acquired from the last executed expression
 */
@Override
public List<Row> runScriptReturnResults(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (int i = 0; i < expressions.length - 1; i++) {
        String expression = expressions[i];
        sqlContext.sql(expression);
    }
    List<Row> rows = sqlContext.sql(expressions[expressions.length - 1]).collectAsList();
    return rows;
}
Example #20
Source File: RelationExtractorTest.java From rdf2x with Apache License 2.0 | 5 votes |
/**
 * Test if expected directed relations are collected from a RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());
    RelationExtractor collector = new RelationExtractor(
        new RelationConfig(),
        jsc(),
        new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();
    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));
    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));
    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
        .add("predicateIndex", DataTypes.IntegerType, false)
        .add("fromTypeIndex", DataTypes.IntegerType, false)
        .add("fromID", DataTypes.LongType, false)
        .add("toTypeIndex", DataTypes.IntegerType, false)
        .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
Example #21
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0 | 5 votes |
@Before
public void setUp() {
    sql = new SQLContext(jsc());
    uriIndex = new IndexMap<>(Arrays.asList(
        "http://example.com/a",
        "http://example.com/b",
        "http://example.com/c",
        "http://example.com/knows",
        "http://example.com/likes",
        "http://example.com/name",
        "http://example.com/age"
    ));
    rdfSchema = new RdfSchema(
        new RdfSchemaCollectorConfig(),
        new ClassGraph(),
        uriIndex,
        uriIndex,
        null
    );
    typeNames = new HashMap<>();
    typeNames.put("http://example.com/a", "a");
    typeNames.put("http://example.com/b", "b");
    typeNames.put("http://example.com/c", "c");

    config = new InstanceRelationWriterConfig();
    persistor = new DataFrameMapPersistor();
    result = persistor.getResultMap();
}
Example #22
Source File: TestPerformanceRegression.java From chronix.spark with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws SolrServerException, IOException {
    ChronixSparkLoader loader = new ChronixSparkLoader();
    ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
    SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

    // BENCHMARK START ...............................
    long start = System.currentTimeMillis();
    for (int i = 0; i < LOOPS; i++) {
        // Load data into ChronixRDD
        ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

        // Some actions
        double mean = rdd.mean();
        double approxMean = rdd.approxMean();
        long observationCount = rdd.countObservations();
        double max = rdd.max();
        double min = rdd.min();
        Iterator<MetricTimeSeries> it = rdd.iterator();
        while (it.hasNext()) {
            MetricTimeSeries mts = it.next();
            System.out.print(".");
        }

        // DataFrame operations
        Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
        ds.count();
    }
    long stop = System.currentTimeMillis();
    // BENCHMARK STOP ...................................

    System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

    chronixSparkContext.getSparkContext().close();
}
Example #23
Source File: TestHDFSParquetImporter.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Test successful insert and verify data consistency.
 */
@Test
public void testImportWithInsert() throws IOException, ParseException {
    try (JavaSparkContext jsc = getJavaSparkContext()) {
        insert(jsc);
        SQLContext sqlContext = new SQLContext(jsc);
        Dataset<Row> ds = HoodieClientTestUtils.read(jsc, basePath + "/testTarget", sqlContext, dfs,
            basePath + "/testTarget/*/*/*/*");

        List<Row> readData = ds.select("timestamp", "_row_key", "rider", "driver",
            "begin_lat", "begin_lon", "end_lat", "end_lon").collectAsList();
        List<HoodieTripModel> result = readData.stream().map(row ->
            new HoodieTripModel(row.getDouble(0), row.getString(1), row.getString(2), row.getString(3),
                row.getDouble(4), row.getDouble(5), row.getDouble(6), row.getDouble(7)))
            .collect(Collectors.toList());

        List<HoodieTripModel> expected = insertData.stream().map(g ->
            new HoodieTripModel(Double.parseDouble(g.get("timestamp").toString()),
                g.get("_row_key").toString(),
                g.get("rider").toString(),
                g.get("driver").toString(),
                Double.parseDouble(g.get("begin_lat").toString()),
                Double.parseDouble(g.get("begin_lon").toString()),
                Double.parseDouble(g.get("end_lat").toString()),
                Double.parseDouble(g.get("end_lon").toString())))
            .collect(Collectors.toList());

        assertTrue(result.containsAll(expected) && expected.containsAll(result)
            && result.size() == expected.size());
    }
}
Example #24
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 5 votes |
@Test
public void textFileHDFSTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);
    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createHDFSRDD(config);

    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createHDFSRDD(config);
}
Example #25
Source File: SaveModelDemo.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SaveModelDemo").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Create a DataFrame by reading JSON
    SQLContext sqlContext = new SQLContext(sc);
    Dataset<Row> dataset = sqlContext.read().format("json").load(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    dataset.write().mode(SaveMode.ErrorIfExists).save("tmp/people2.json"); // fail with an error if the target exists
    dataset.write().mode(SaveMode.Append).save("tmp/people2.json"); // append
    dataset.write().mode(SaveMode.Ignore).save("tmp/people2.json"); // ignore errors
    dataset.write().mode(SaveMode.Overwrite).save("tmp/people2.json"); // overwrite

    sc.close();
}
Example #26
Source File: BatchProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    Properties prop = PropertyFileReader.readPropertyFile("iot-spark.properties");
    String file = prop.getProperty("com.iot.app.hdfs") + "iot-data-parque";
    String[] jars = {prop.getProperty("com.iot.app.jar")};

    JavaSparkContext sparkContext = getSparkContext(prop, jars);
    SQLContext sqlContext = new SQLContext(sparkContext);
    Dataset<Row> dataFrame = getDataFrame(sqlContext, file);
    JavaRDD<IoTData> rdd = dataFrame.javaRDD().map(getRowIoTDataFunction());

    BatchHeatMapProcessor processor = new BatchHeatMapProcessor();
    processor.processHeatMap(rdd);

    sparkContext.close();
    sparkContext.stop();
}
Example #27
Source File: DataSparkLoader.java From toolbox with Apache License 2.0 | 5 votes |
public static DataSpark open(SQLContext sqlContext, String path, String formatFile) throws Exception {
    // Load the data and store it into an object of class DataFrame
    DataFrame df = sqlContext.read().format(formatFile).load(path);

    // Create an AMIDST object for managing the data
    return loadSparkDataFrame(df);
}
Example #28
Source File: UserVisitAnalyze.java From UserActionAnalyzePlatform with Apache License 2.0 | 5 votes |
private static void mock(JavaSparkContext context, SQLContext sc) {
    boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if (local) {
        MockData.mock(context, sc);
    }
}
Example #29
Source File: UserVisitAnalyze.java From UserActionAnalyzePlatform with Apache License 2.0 | 5 votes |
/**
 * Determines whether we are running in a production environment
 * and returns the appropriate SQL context.
 * @param sc
 * @return
 */
public static SQLContext getSQLContext(SparkContext sc) {
    boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if (local) {
        return new SQLContext(sc);
    }
    return new HiveContext(sc);
}
Example #30
Source File: AreaTop3ProductSpark.java From BigDataPlatform with GNU General Public License v3.0 | 5 votes |
/**
 * Query the click action data within the specified date range.
 * @param sqlContext
 * @param startDate start date
 * @param endDate end date
 * @return click action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
        SQLContext sqlContext, String startDate, String endDate) {
    // Query user visit action data from user_visit_action
    // First restriction: click_product_id must not be null, i.e. the row represents a click action
    // Second restriction: the data must fall within the user-specified date range
    String sql =
        "SELECT "
            + "city_id,"
            + "click_product_id product_id "
            + "FROM user_visit_action "
            + "WHERE click_product_id IS NOT NULL "
            + "AND day>='" + startDate + "' "
            + "AND day<='" + endDate + "'";

    Dataset<Row> clickActionDF = sqlContext.sql(sql);
    JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

    JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
        new PairFunction<Row, Long, Row>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Long, Row> call(Row row) throws Exception {
                Long cityid = row.getLong(0);
                return new Tuple2<Long, Row>(cityid, row);
            }
        });

    return cityid2clickActionRDD;
}