org.apache.spark.sql.SQLContext Java Examples
The following examples show how to use
org.apache.spark.sql.SQLContext.
The examples are taken from open source projects; the source file, originating project, and license are noted above each example.
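For orientation, here is a minimal sketch of the workflow the examples below build on: create an SQLContext from a JavaSparkContext, load a DataFrame, and run a SQL query against it. This assumes Spark 1.x (where SQLContext is the entry point; in Spark 2.x and later, SparkSession supersedes it), a local master, and a hypothetical people.json input file.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class SQLContextSketch {
    public static void main(String[] args) {
        // Local Spark context; adjust the master and app name for your environment.
        SparkConf conf = new SparkConf().setAppName("SQLContextSketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // SQLContext is the Spark 1.x entry point for DataFrame and SQL functionality.
        SQLContext sqlContext = new SQLContext(jsc);

        // Hypothetical input path; any JSON file with "name" and "age" columns would do.
        DataFrame people = sqlContext.read().json("data/people.json");

        // Register the DataFrame as a temporary table and query it with SQL.
        people.registerTempTable("people");
        DataFrame adults = sqlContext.sql("SELECT name FROM people WHERE age >= 18");

        for (Row row : adults.collectAsList()) {
            System.out.println(row.getString(0));
        }

        jsc.stop();
    }
}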
Example #1
Source File: HoodieSnapshotExporter.java From hudi with Apache License 2.0 | 7 votes |
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg,
                             List<String> partitions, String latestCommitTimestamp) {
    Partitioner defaultPartitioner = dataset -> {
        Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
        return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
            ? hoodieDroppedDataset.write()
            : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
    };

    Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
        ? defaultPartitioner
        : ReflectionUtils.loadClass(cfg.outputPartitioner);

    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    Iterator<String> exportingFilePaths = jsc
        .parallelize(partitions, partitions.size())
        .flatMap(partition -> fsView
            .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
            .map(HoodieBaseFile::getPath).iterator())
        .toLocalIterator();

    Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
    partitioner.partition(sourceDataset)
        .format(cfg.outputFormat)
        .mode(SaveMode.Overwrite)
        .save(cfg.targetOutputPath);
}
Example #2
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
public void createHDFSRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRdd = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRdd);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

    RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
    verify(javaRdd, times(1)).map(any(Function.class));
}
Example #3
Source File: NGramBuilder.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Creates a n-gram data frame from text lines.
 * @param lines
 * @return a n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
        new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
    });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
        .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
Example #4
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
public void textFileS3Test() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);
    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createS3RDD(config);

    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createS3RDD(config);
}
Example #5
Source File: SubStringCounterDataSource.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
@Override
public BaseRelation createRelation(SQLContext arg0, Map<String, String> arg1) {
    log.debug("-> createRelation()");

    java.util.Map<String, String> javaMap = scala.collection.JavaConverters
        .mapAsJavaMapConverter(arg1).asJava();

    SubStringCounterRelation br = new SubStringCounterRelation();
    br.setSqlContext(arg0);

    for (java.util.Map.Entry<String, String> entry : javaMap.entrySet()) {
        String key = entry.getKey();
        String value = entry.getValue();
        log.debug("[{}] --> [{}]", key, value);
        if (key.compareTo(K.PATH) == 0) {
            br.setFilename(value);
        } else if (key.startsWith(K.COUNT)) {
            br.addCriteria(value);
        }
    }

    return br;
}
Example #6
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Tags a list of sequences and returns a list of tag sequences.
 * @param sentences
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
    List<Row> rows = new LinkedList<Row>();
    for (String sentence : sentences) {
        rows.add(RowFactory.create(sentence));
    }
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(rows, schema);
    if (cmmModel != null) {
        DataFrame output = cmmModel.transform(input).repartition(1);
        return output.javaRDD().map(new RowToStringFunction(1)).collect();
    } else {
        System.err.println("Tagging model is null. You need to create or load a model first.");
        return null;
    }
}
Example #7
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0 | 6 votes |
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
                                      String instantTime) {
    HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
    if (!commitTimeline.containsInstant(commitInstant)) {
        throw new HoodieException("No commit exists at " + instantTime);
    }
    try {
        HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
        LOG.info("Path :" + paths.values());
        return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
            .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
    } catch (Exception e) {
        throw new HoodieException("Error reading commit " + instantTime, e);
    }
}
Example #8
Source File: DataStreamLoaderExample.java From toolbox with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("SLink!").setMaster("local");
    SparkContext sc = new SparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Path to dataset
    String path = "datasets/simulated/WI_samples.json";

    // Create an AMIDST object for managing the data
    DataSpark dataSpark = DataSparkLoader.open(sqlContext, path);

    // Print all the instances in the dataset
    dataSpark.collectDataStream()
        .forEach(dataInstance -> System.out.println(dataInstance));
}
Example #9
Source File: BigQuerySparkSQL.java From spark-on-k8s-gcp-examples with Apache License 2.0 | 6 votes |
private static BigQuerySQLContext createBigQuerySQLContext(String[] args) {
    String projectId = args[0];
    Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
        "GCP project ID must not be empty");
    String gcsBucket = args[1];
    Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
        "GCS bucket must not be empty");

    String serviceAccountJsonKeyFilePath = System.getenv(APPLICATION_CREDENTIALS_ENV);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(serviceAccountJsonKeyFilePath),
        APPLICATION_CREDENTIALS_ENV + " must be set");

    SQLContext sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
    BigQuerySQLContext bigQuerySQLContext = new BigQuerySQLContext(sqlContext);
    bigQuerySQLContext.setBigQueryProjectId(projectId);
    bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
    bigQuerySQLContext.setGcpJsonKeyFile(serviceAccountJsonKeyFilePath);

    return bigQuerySQLContext;
}
Example #10
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0 | 6 votes |
/**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
                                     HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata, and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
        Dataset<Row> rows = null;
        if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
            rows = sqlContext.read().parquet(paths);
        }
        return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
    }
}
Example #11
Source File: NeedingHelpGoPackageFinder.java From spark-on-k8s-gcp-examples with Apache License 2.0 | 6 votes |
private NeedingHelpGoPackageFinder(
        String projectId,
        String bigQueryDataset,
        String gcsBucket,
        boolean useSampleTables) {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
        "GCP project ID must not be empty");
    Preconditions.checkArgument(!Strings.isNullOrEmpty(bigQueryDataset),
        "BigQuery dataset name must not be empty");
    Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
        "GCS bucket must not be empty");

    this.projectId = projectId;
    this.bigQueryDataset = bigQueryDataset;
    this.sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
    this.bigQuerySQLContext = new BigQuerySQLContext(this.sqlContext);
    this.bigQuerySQLContext.setBigQueryProjectId(projectId);
    this.bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
    this.useSampleTables = useSampleTables;
}
Example #12
Source File: MultiExpressionScript.java From HiveQLUnit with Apache License 2.0 | 6 votes |
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by spark through the TestHiveServer TestRule, used to run hql expressions
 */
@Override
public void runScript(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (String expression : expressions) {
        sqlContext.sql(expression);
    }
}
Example #13
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test(expected = UnsupportedOperationException.class)
public void createJavaSchemaFromEmptyRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    when(singleRdd.first()).thenThrow(new UnsupportedOperationException());

    deepSparkContextSpy.createJavaSchemaRDD(config);
}
Example #14
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Tags a list of sequences and writes the result to an output file with a
 * desired output format.
 *
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    List<Row> rows = new LinkedList<Row>();
    for (String sentence : sentences) {
        rows.add(RowFactory.create(sentence));
    }
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(rows, schema);
    tag(input, outputFileName, outputFormat);
}
Example #15
Source File: SparkSqlInterpreter.java From Explorer with Apache License 2.0 | 5 votes |
public int getProgress() {
    SQLContext sqlc = getSparkInterpreter().getSQLContext();
    SparkContext sc = sqlc.sparkContext();
    JobProgressListener sparkListener = getSparkInterpreter().getJobProgressListener();
    int completedTasks = 0;
    int totalTasks = 0;

    DAGScheduler scheduler = sc.dagScheduler();
    HashSet<ActiveJob> jobs = scheduler.activeJobs();
    Iterator<ActiveJob> it = jobs.iterator();
    while (it.hasNext()) {
        ActiveJob job = it.next();
        String g = (String) job.properties().get("spark.jobGroup.id");
        if (jobGroup.equals(g)) {
            int[] progressInfo = null;
            if (sc.version().startsWith("1.0")) {
                progressInfo = getProgressFromStage_1_0x(sparkListener, job.finalStage());
            } else if (sc.version().startsWith("1.1") || sc.version().startsWith("1.2")) {
                progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage());
            } else {
                logger.warn("Spark {} getting progress information not supported" + sc.version());
                continue;
            }
            totalTasks += progressInfo[0];
            completedTasks += progressInfo[1];
        }
    }

    if (totalTasks == 0) {
        return 0;
    }
    return completedTasks * 100 / totalTasks;
}
Example #16
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Tags a distributed list of sentences and writes the result to an output file with
 * a desired output format.
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(sentences, schema);
    tag(input, outputFileName, outputFormat);
}
Example #17
Source File: MetadataWriter.java From rdf2x with Apache License 2.0 | 5 votes |
/**
 * @param sc spark context to be used
 * @param persistor output persistor
 * @param rdfSchema schema storing information about classes and properties
 */
public MetadataWriter(JavaSparkContext sc, Persistor persistor, RdfSchema rdfSchema) {
    this.sql = new SQLContext(sc);
    this.persistor = persistor;
    this.rdfSchema = rdfSchema;
    this.stats = new ArrayList<>();
}
Example #18
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 5 votes |
/**
 * Creates a data frame from a list of tagged sentences.
 * @param taggedSentences
 * @return a data frame of two columns: "sentence" and "partOfSpeech".
 */
public DataFrame createDataFrame(List<String> taggedSentences) {
    List<String> wordSequences = new LinkedList<String>();
    List<String> tagSequences = new LinkedList<String>();
    for (String taggedSentence : taggedSentences) {
        StringBuilder wordBuf = new StringBuilder();
        StringBuilder tagBuf = new StringBuilder();
        String[] tokens = taggedSentence.split("\\s+");
        for (String token : tokens) {
            String[] parts = token.split("/");
            if (parts.length == 2) {
                wordBuf.append(parts[0]);
                wordBuf.append(' ');
                tagBuf.append(parts[1]);
                tagBuf.append(' ');
            } else { // this token is "///"
                wordBuf.append('/');
                wordBuf.append(' ');
                tagBuf.append('/');
                tagBuf.append(' ');
            }
        }
        wordSequences.add(wordBuf.toString().trim());
        tagSequences.add(tagBuf.toString().trim());
    }
    if (verbose) {
        System.out.println("Number of sentences = " + wordSequences.size());
    }
    List<Row> rows = new LinkedList<Row>();
    for (int i = 0; i < wordSequences.size(); i++) {
        rows.add(RowFactory.create(wordSequences.get(i), tagSequences.get(i)));
    }
    JavaRDD<Row> jrdd = jsc.parallelize(rows);
    StructType schema = new StructType(new StructField[]{
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
        new StructField("partOfSpeech", DataTypes.StringType, false, Metadata.empty())
    });
    return new SQLContext(jsc).createDataFrame(jrdd, schema);
}
Example #19
Source File: MultiExpressionScript.java From HiveQLUnit with Apache License 2.0 | 5 votes |
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by spark through the TestHiveServer TestRule, used to run hql expressions
 * @return the row results acquired from the last executed expression
 */
@Override
public List<Row> runScriptReturnResults(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (int i = 0; i < expressions.length - 1; i++) {
        String expression = expressions[i];
        sqlContext.sql(expression);
    }
    List<Row> rows = sqlContext.sql(expressions[expressions.length - 1]).collectAsList();
    return rows;
}
Example #20
Source File: RelationExtractorTest.java From rdf2x with Apache License 2.0 | 5 votes |
/**
 * Test if expected directed relations are collected from a RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());
    RelationExtractor collector = new RelationExtractor(
        new RelationConfig(),
        jsc(),
        new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();
    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));
    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));
    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
        .add("predicateIndex", DataTypes.IntegerType, false)
        .add("fromTypeIndex", DataTypes.IntegerType, false)
        .add("fromID", DataTypes.LongType, false)
        .add("toTypeIndex", DataTypes.IntegerType, false)
        .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
Example #21
Source File: InstanceRelationWriterTest.java From rdf2x with Apache License 2.0 | 5 votes |
@Before
public void setUp() {
    sql = new SQLContext(jsc());
    uriIndex = new IndexMap<>(Arrays.asList(
        "http://example.com/a",
        "http://example.com/b",
        "http://example.com/c",
        "http://example.com/knows",
        "http://example.com/likes",
        "http://example.com/name",
        "http://example.com/age"
    ));
    rdfSchema = new RdfSchema(
        new RdfSchemaCollectorConfig(),
        new ClassGraph(),
        uriIndex,
        uriIndex,
        null
    );
    typeNames = new HashMap<>();
    typeNames.put("http://example.com/a", "a");
    typeNames.put("http://example.com/b", "b");
    typeNames.put("http://example.com/c", "c");

    config = new InstanceRelationWriterConfig();
    persistor = new DataFrameMapPersistor();
    result = persistor.getResultMap();
}
Example #22
Source File: TestPerformanceRegression.java From chronix.spark with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws SolrServerException, IOException {
    ChronixSparkLoader loader = new ChronixSparkLoader();
    ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
    SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

    // BENCHMARK START ...............................
    long start = System.currentTimeMillis();
    for (int i = 0; i < LOOPS; i++) {
        // Load data into ChronixRDD
        ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

        // Some actions
        double mean = rdd.mean();
        double approxMean = rdd.approxMean();
        long observationCount = rdd.countObservations();
        double max = rdd.max();
        double min = rdd.min();
        Iterator<MetricTimeSeries> it = rdd.iterator();
        while (it.hasNext()) {
            MetricTimeSeries mts = it.next();
            System.out.print(".");
        }

        // DataFrame operations
        Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
        ds.count();
    }
    long stop = System.currentTimeMillis();
    // BENCHMARK STOP ...................................

    System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

    chronixSparkContext.getSparkContext().close();
}
Example #23
Source File: TestHDFSParquetImporter.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Test successful insert and verify data consistency.
 */
@Test
public void testImportWithInsert() throws IOException, ParseException {
    try (JavaSparkContext jsc = getJavaSparkContext()) {
        insert(jsc);
        SQLContext sqlContext = new SQLContext(jsc);
        Dataset<Row> ds = HoodieClientTestUtils.read(jsc, basePath + "/testTarget", sqlContext, dfs,
            basePath + "/testTarget/*/*/*/*");

        List<Row> readData = ds.select("timestamp", "_row_key", "rider", "driver",
            "begin_lat", "begin_lon", "end_lat", "end_lon").collectAsList();
        List<HoodieTripModel> result = readData.stream().map(row ->
            new HoodieTripModel(row.getDouble(0), row.getString(1), row.getString(2), row.getString(3),
                row.getDouble(4), row.getDouble(5), row.getDouble(6), row.getDouble(7)))
            .collect(Collectors.toList());

        List<HoodieTripModel> expected = insertData.stream().map(g ->
            new HoodieTripModel(Double.parseDouble(g.get("timestamp").toString()),
                g.get("_row_key").toString(),
                g.get("rider").toString(),
                g.get("driver").toString(),
                Double.parseDouble(g.get("begin_lat").toString()),
                Double.parseDouble(g.get("begin_lon").toString()),
                Double.parseDouble(g.get("end_lat").toString()),
                Double.parseDouble(g.get("end_lon").toString())))
            .collect(Collectors.toList());

        assertTrue(result.containsAll(expected) && expected.containsAll(result)
            && result.size() == expected.size());
    }
}
Example #24
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 5 votes |
@Test
public void textFileHDFSTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);
    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createHDFSRDD(config);

    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createHDFSRDD(config);
}
Example #25
Source File: SaveModelDemo.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SaveModelDemo").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Create a DataFrame by reading JSON
    SQLContext sqlContext = new SQLContext(sc);
    Dataset<Row> dataset = sqlContext.read().format("json").load(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    dataset.write().mode(SaveMode.ErrorIfExists).save("tmp/people2.json"); // fail with an error if the target exists
    dataset.write().mode(SaveMode.Append).save("tmp/people2.json"); // append
    dataset.write().mode(SaveMode.Ignore).save("tmp/people2.json"); // ignore errors
    dataset.write().mode(SaveMode.Overwrite).save("tmp/people2.json"); // overwrite

    sc.close();
}
Example #26
Source File: BatchProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    Properties prop = PropertyFileReader.readPropertyFile("iot-spark.properties");
    String file = prop.getProperty("com.iot.app.hdfs") + "iot-data-parque";
    String[] jars = {prop.getProperty("com.iot.app.jar")};

    JavaSparkContext sparkContext = getSparkContext(prop, jars);
    SQLContext sqlContext = new SQLContext(sparkContext);
    Dataset<Row> dataFrame = getDataFrame(sqlContext, file);
    JavaRDD<IoTData> rdd = dataFrame.javaRDD().map(getRowIoTDataFunction());

    BatchHeatMapProcessor processor = new BatchHeatMapProcessor();
    processor.processHeatMap(rdd);

    sparkContext.close();
    sparkContext.stop();
}
Example #27
Source File: DataSparkLoader.java From toolbox with Apache License 2.0 | 5 votes |
public static DataSpark open(SQLContext sqlContext, String path, String formatFile) throws Exception {
    // Load the data and store it into an object of class DataFrame
    DataFrame df = sqlContext.read().format(formatFile).load(path);

    // Create an AMIDST object for managing the data
    return loadSparkDataFrame(df);
}
Example #28
Source File: UserVisitAnalyze.java From UserActionAnalyzePlatform with Apache License 2.0 | 5 votes |
private static void mock(JavaSparkContext context, SQLContext sc) {
    boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if (local) {
        MockData.mock(context, sc);
    }
}
Example #29
Source File: UserVisitAnalyze.java From UserActionAnalyzePlatform with Apache License 2.0 | 5 votes |
/**
 * Determines whether we are running in a production environment
 * and returns the appropriate SQL context.
 * @param sc
 * @return
 */
public static SQLContext getSQLContext(SparkContext sc) {
    boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if (local) {
        return new SQLContext(sc);
    }
    return new HiveContext(sc);
}
Example #30
Source File: AreaTop3ProductSpark.java From BigDataPlatform with GNU General Public License v3.0 | 5 votes |
/**
 * Query the click action data within the specified date range.
 * @param sqlContext
 * @param startDate start date
 * @param endDate end date
 * @return click action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
        SQLContext sqlContext, String startDate, String endDate) {
    // Query user visit action data from user_visit_action
    // First restriction: click_product_id must not be null, i.e. the row represents a click action
    // Second restriction: the data must fall within the user-specified date range
    String sql =
        "SELECT "
            + "city_id,"
            + "click_product_id product_id "
            + "FROM user_visit_action "
            + "WHERE click_product_id IS NOT NULL "
            + "AND day>='" + startDate + "' "
            + "AND day<='" + endDate + "'";

    Dataset<Row> clickActionDF = sqlContext.sql(sql);
    JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

    JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
        new PairFunction<Row, Long, Row>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<Long, Row> call(Row row) throws Exception {
                Long cityid = row.getLong(0);
                return new Tuple2<Long, Row>(cityid, row);
            }
        });

    return cityid2clickActionRDD;
}