Java Code Examples for org.apache.spark.sql.Dataset#select()
The following examples show how to use org.apache.spark.sql.Dataset#select(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
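Before the project code below, here is a minimal sketch of the select() variants these examples rely on: selecting by column name, selecting Column expressions built with org.apache.spark.sql.functions, and selectExpr() for SQL fragments. The SparkSession settings, the people.json path, and the column names are assumptions made up for illustration, not taken from any of the projects below.

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SelectSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("SelectSketch")
            .getOrCreate();

        // hypothetical input file and columns, for illustration only
        Dataset<Row> df = spark.read().json("people.json");

        // 1. select by column name (String varargs)
        Dataset<Row> byName = df.select("name", "age");

        // 2. select Column expressions
        Dataset<Row> byColumn = df.select(col("name"), col("age").plus(1).as("ageNextYear"));

        // 3. selectExpr: SQL expressions given as strings
        Dataset<Row> byExpr = df.selectExpr("name", "age + 1 AS ageNextYear");

        byName.show();
        byColumn.show();
        byExpr.show();

        spark.stop();
    }
}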
Example 1
Source File: ProjectRestrictMapFunction.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
@Override
public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) {
    ProjectRestrictOperation op = (ProjectRestrictOperation) operationContext.getOperation();
    Dataset<Row> df = null;
    // TODO: Enable the commented try-catch block after regression testing.
    // This would be a safeguard against unanticipated exceptions:
    //    org.apache.spark.sql.catalyst.parser.ParseException
    //    org.apache.spark.sql.AnalysisException
    // ... which may occur if the Splice parser fails to detect a
    // SQL expression which SparkSQL does not support.
    if (op.hasExpressions()) {
        // try {
        df = input.selectExpr(op.getExpressions());
        return Pair.newPair(df, context);
        // }
        // catch (Exception e) {
        // }
    }
    int[] mapping = op.projectMapping;
    Column[] columns = new Column[mapping.length];
    for (int i = 0; i < mapping.length; ++i) {
        columns[i] = input.col("c" + (mapping[i] - 1));
    }
    df = input.select(columns);
    return Pair.newPair(df, context);
}
Example 2
Source File: SparkPredictionServiceRunner.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License | 6 votes |
public Dataset<Row> run(Dataset dataset) {
    // only use configured variables for pipeline
    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(this.sparkRunnerConfig);
    List<String> predictionVars = configuration.getModelPredictionConfiguration().getPredictionVariables();
    List<Column> usedColumns = new ArrayList<>();
    for (String var : predictionVars) {
        usedColumns.add(new Column(var));
    }
    dataset = dataset.select(BpmnaiUtils.getInstance().asSeq(usedColumns));

    // go through pipe elements
    // Define processing steps to run
    final PreprocessingRunner preprocessingRunner = new PreprocessingRunner();

    for (PipelineStep ps : pipelineManager.getOrderedPipeline()) {
        preprocessingRunner.addPreprocessorStep(ps);
    }

    // Run processing runner
    Dataset<Row> resultDataset = preprocessingRunner.run(dataset, this.sparkRunnerConfig);

    writeConfig();

    return resultDataset;
}
Example 3
Source File: ParseJSONDeriver.java From envelope with Apache License 2.0 | 6 votes |
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
    String parsedStructTemporaryFieldName = "__parsed_json";

    Dataset<Row> dependency = dependencies.get(stepName);

    Dataset<Row> parsed = dependency.select(
        functions.from_json(new Column(fieldName), schema, options).as(parsedStructTemporaryFieldName));

    if (asStruct) {
        return parsed.withColumnRenamed(parsedStructTemporaryFieldName, structFieldName);
    } else {
        for (StructField parsedField : schema.fields()) {
            parsed = parsed.withColumn(
                parsedField.name(),
                new Column(parsedStructTemporaryFieldName + "." + parsedField.name()));
        }

        return parsed.drop(parsedStructTemporaryFieldName);
    }
}
Example 4
Source File: AbstractConceptMaps.java From bunsen with Apache License 2.0 | 6 votes |
/**
 * Writes mapping records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described
 * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param mappings a dataset of mapping records
 * @param tableName the table to write them to
 */
private static void writeMappingsToTable(Dataset<Mapping> mappings, String tableName) {

    // Note the last two columns here must be the partitioned-by columns
    // in order and in lower case for Spark to properly match
    // them to the partitions.
    Dataset<Row> orderedColumnDataset = mappings.select("sourceValueSet",
        "targetValueSet",
        "sourceSystem",
        "sourceValue",
        "targetSystem",
        "targetValue",
        "equivalence",
        "conceptmapuri",
        "conceptmapversion");

    orderedColumnDataset
        .write()
        .insertInto(tableName);
}
Example 5
Source File: AdvancedSearchDataset.java From mmtf-spark with Apache License 2.0 | 5 votes |
private static Dataset<Row> getEntityToChainId() throws IOException {
    // get entityID to strandId mapping
    String query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly";
    Dataset<Row> mapping = PdbjMineDataset.getDataset(query);

    // split one-to-many relationships into multiple records: 'A,B' -> [A, B] -> explode to separate rows
    mapping = mapping.withColumn("chainId", split(mapping.col("pdbx_strand_id"), ","));
    mapping = mapping.withColumn("chainId", explode(col("chainId")));

    // create a structureChainId column, e.g. 1XYZ + A -> 1XYZ.A
    mapping = mapping.withColumn("structureChainId", concat_ws(".", col("structureId"), col("chainId")));

    return mapping.select("entity_id", "structureId", "structureChainId");
}
Example 6
Source File: RDDConverterUtilsExt.java From systemds with Apache License 2.0 | 5 votes |
public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) {
    ArrayList<String> columnToSelect = new ArrayList<String>();
    for (int i = 1; i < columns.size(); i++) {
        columnToSelect.add(columns.get(i));
    }
    return df.select(columns.get(0), scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
}
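The example above converts the tail of the column list to a Scala Buffer via scala.collection.JavaConversions, which is deprecated in newer Scala versions. For comparison, the same projection can be written against the select(String col, String... cols) overload with no Scala interop at all. The sketch below is an assumption for illustration, not part of systemds:

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class ProjectColumnsSketch {
    // Hypothetical variant of projectColumns using the String varargs overload of select().
    public static Dataset<Row> projectColumns(Dataset<Row> df, List<String> columns) {
        // first column goes in the leading argument, the rest as varargs
        List<String> rest = new ArrayList<>(columns.subList(1, columns.size()));
        return df.select(columns.get(0), rest.toArray(new String[0]));
    }
}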
Example 7
Source File: SelectDeriver.java From envelope with Apache License 2.0 | 5 votes |
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    dependencyCheck(dependencies);
    Dataset<Row> sourceStep = dependencies.get(stepName);
    if (useIncludeFields) {
        if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)) {
            throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS + " are not found in input dependency schema \n" +
                "Available columns: " + Arrays.toString(sourceStep.columns()));
        }
        String firstCol = includeFields.get(0);
        includeFields.remove(0);
        return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
    } else {
        if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)) {
            throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS + " are not found in input dependency schema \n" +
                "Available columns: " + Arrays.toString(sourceStep.columns()));
        }
        return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
    }
}
Example 8
Source File: Hierarchies.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Writes ancestor records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described
 * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

    Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
        "descendantValue",
        "ancestorSystem",
        "ancestorValue",
        "uri",
        "version");

    orderedColumnDataset.write()
        .mode(SaveMode.ErrorIfExists)
        .insertInto(tableName);
}
Example 9
Source File: AbstractValueSets.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Writes value records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described
 * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {

    // Note the last two columns here must be the partitioned-by columns in order and in lower case
    // for Spark to properly match them to the partitions
    Dataset<Row> orderColumnDataset = values.select("system",
        "version",
        "value",
        "valueseturi",
        "valuesetversion");

    orderColumnDataset.write()
        .mode(SaveMode.ErrorIfExists)
        .insertInto(tableName);
}
Example 10
Source File: SparkMLHouses.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 5 votes |
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType HOUSES_SCHEMA = new StructType()
        .add("House", LongType, true)
        .add("Taxes", LongType, true)
        .add("Bedrooms", LongType, true)
        .add("Baths", FloatType, true)
        .add("Quadrant", LongType, true)
        .add("NW", StringType, true)
        .add("Price($)", LongType, false)
        .add("Size(sqft)", LongType, false)
        .add("lot", LongType, true);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession sparkSession = SparkSession.builder()
        .config(conf)
        .getOrCreate();

    Dataset<Row> housesDF = sparkSession.read()
        .schema(HOUSES_SCHEMA)
        .json(HOUSES_FILE_PATH);

    // Gathering Data
    Dataset<Row> gatheredDF = housesDF.select(col("Taxes"),
        col("Bedrooms"), col("Baths"),
        col("Size(sqft)"), col("Price($)"));

    // Data Preparation
    Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");

    Imputer imputer = new Imputer()
        // .setMissingValue(1.0d)
        .setInputCols(new String[] { "Baths" })
        .setOutputCols(new String[] { "~Baths~" });

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
        .setOutputCol("features");

    // Choosing a Model
    LinearRegression linearRegression = new LinearRegression();
    linearRegression.setMaxIter(1000);

    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] { imputer, assembler, linearRegression });

    // Training The Data
    Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });

    Dataset<Row> trainDF = splitDF[0];
    Dataset<Row> evaluationDF = splitDF[1];

    PipelineModel pipelineModel = pipeline.fit(trainDF);

    // Evaluation
    Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

    predictionsDF.show(false);

    Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), col("prediction"));

    RegressionEvaluator evaluteR2 = new RegressionEvaluator().setMetricName("r2");
    RegressionEvaluator evaluteRMSE = new RegressionEvaluator().setMetricName("rmse");

    double r2 = evaluteR2.evaluate(forEvaluationDF);
    double rmse = evaluteRMSE.evaluate(forEvaluationDF);

    logger.info("---------------------------");
    logger.info("R2 =" + r2);
    logger.info("RMSE =" + rmse);
    logger.info("---------------------------");
}
Example 11
Source File: AdvancedSearchDataset.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * Runs an RCSB PDB Advanced Search web service using an XML query description.
 * The returned dataset contains the following fields dependent on the query type:
 * <pre>
 *   structureId, e.g., 1STP
 *   structureChainId, e.g., 4HHB.A
 *   ligandId, e.g., HEM
 * </pre>
 *
 * @param xmlQuery RCSB PDB Advanced Query XML
 * @return dataset of ids
 * @throws IOException
 */
public static Dataset<Row> getDataset(String xmlQuery) throws IOException {
    // run advanced query
    List<String> results = AdvancedQueryService.postQuery(xmlQuery);

    // convert list of lists to a dataframe
    SparkSession spark = SparkSession.builder().getOrCreate();

    // handle 3 types of results based on length of string:
    //   structureId: 4 (e.g., 4HHB)
    //   structureEntityId: > 4 (e.g., 4HHB:1)
    //   entityId: < 4 (e.g., HEM)
    Dataset<Row> ds = null;
    if (results.size() > 0) {
        if (results.get(0).length() > 4) {
            ds = spark.createDataset(results, Encoders.STRING()).toDF("structureEntityId");

            // if results contain an entity id, e.g., 101M:1, then map entityId to structureChainId
            ds = ds.withColumn("structureId", substring_index(col("structureEntityId"), ":", 1));
            ds = ds.withColumn("entityId", substring_index(col("structureEntityId"), ":", -1));

            Dataset<Row> mapping = getEntityToChainId();
            ds = ds.join(mapping, ds.col("structureId").equalTo(mapping.col("structureId"))
                .and(ds.col("entityId").equalTo(mapping.col("entity_id"))));

            ds = ds.select("structureChainId");
        } else if (results.get(0).length() < 4) {
            ds = spark.createDataset(results, Encoders.STRING()).toDF("ligandId");
        } else {
            ds = spark.createDataset(results, Encoders.STRING()).toDF("structureId");
        }
    }

    return ds;
}
Example 12
Source File: JavaMultilayerPerceptronClassifierExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaMultilayerPerceptronClassifierExample")
        .getOrCreate();

    // $example on$
    // Load training data
    String path = "data/mllib/sample_multiclass_classification_data.txt";
    Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);

    // Split the data into train and test
    Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
    Dataset<Row> train = splits[0];
    Dataset<Row> test = splits[1];

    // specify layers for the neural network:
    // input layer of size 4 (features), two intermediate of size 5 and 4
    // and output of size 3 (classes)
    int[] layers = new int[] {4, 5, 4, 3};

    // create the trainer and set its parameters
    MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier()
        .setLayers(layers)
        .setBlockSize(128)
        .setSeed(1234L)
        .setMaxIter(100);

    // train the model
    MultilayerPerceptronClassificationModel model = trainer.fit(train);

    // compute accuracy on the test set
    Dataset<Row> result = model.transform(test);
    Dataset<Row> predictionAndLabels = result.select("prediction", "label");
    MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
        .setMetricName("accuracy");

    System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels));
    // $example off$

    spark.stop();
}
Example 13
Source File: JavaEstimatorTransformerParamExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaEstimatorTransformerParamExample")
        .getOrCreate();

    // $example on$
    // Prepare training data.
    List<Row> dataTraining = Arrays.asList(
        RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
        RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
        RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
        RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

    // Create a LogisticRegression instance. This instance is an Estimator.
    LogisticRegression lr = new LogisticRegression();
    // Print out the parameters, documentation, and any default values.
    System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

    // We may set parameters using setter methods.
    lr.setMaxIter(10).setRegParam(0.01);

    // Learn a LogisticRegression model. This uses the parameters stored in lr.
    LogisticRegressionModel model1 = lr.fit(training);
    // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
    // we can view the parameters it used during fit().
    // This prints the parameter (name: value) pairs, where names are unique IDs for this
    // LogisticRegression instance.
    System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

    // We may alternatively specify parameters using a ParamMap.
    ParamMap paramMap = new ParamMap()
        .put(lr.maxIter().w(20))  // Specify 1 Param.
        .put(lr.maxIter(), 30)  // This overwrites the original maxIter.
        .put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

    // One can also combine ParamMaps.
    ParamMap paramMap2 = new ParamMap()
        .put(lr.probabilityCol().w("myProbability"));  // Change output column name
    ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

    // Now learn a new model using the paramMapCombined parameters.
    // paramMapCombined overrides all parameters set earlier via lr.set* methods.
    LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
    System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

    // Prepare test documents.
    List<Row> dataTest = Arrays.asList(
        RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
        RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
        RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
    );
    Dataset<Row> test = spark.createDataFrame(dataTest, schema);

    // Make predictions on test documents using the Transformer.transform() method.
    // LogisticRegression.transform will only use the 'features' column.
    // Note that model2.transform() outputs a 'myProbability' column instead of the usual
    // 'probability' column since we renamed the lr.probabilityCol parameter previously.
    Dataset<Row> results = model2.transform(test);
    Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
    for (Row r : rows.collectAsList()) {
        System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
            + ", prediction=" + r.get(3));
    }
    // $example off$

    spark.stop();
}
Example 14
Source File: SparkMultiClassClassifier.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * Dataset must at least contain the following two columns:
 *   label: the class labels
 *   features: feature vector
 * @param data
 * @return map with metrics
 */
public Map<String, String> fit(Dataset<Row> data) {
    int classCount = (int) data.select(label).distinct().count();

    StringIndexerModel labelIndexer = new StringIndexer()
        .setInputCol(label)
        .setOutputCol("indexedLabel")
        .fit(data);

    // Split the data into training and test sets (30% held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[] {1.0 - testFraction, testFraction}, seed);
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    String[] labels = labelIndexer.labels();

    System.out.println();
    System.out.println("Class\tTrain\tTest");
    for (String l : labels) {
        System.out.println(l + "\t" + trainingData.select(label).filter(label + " = '" + l + "'").count()
            + "\t" + testData.select(label).filter(label + " = '" + l + "'").count());
    }

    // Set input columns
    predictor
        .setLabelCol("indexedLabel")
        .setFeaturesCol("features");

    // Convert indexed labels back to original labels.
    IndexToString labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(labelIndexer.labels());

    // Chain indexers and forest in a Pipeline
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] {labelIndexer, predictor, labelConverter});

    // Train model. This also runs the indexers.
    PipelineModel model = pipeline.fit(trainingData);

    // Make predictions.
    Dataset<Row> predictions = model.transform(testData).cache();

    // Display some sample predictions
    System.out.println();
    System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
    predictions.sample(false, 0.1, seed).show(25);

    predictions = predictions.withColumnRenamed(label, "stringLabel");
    predictions = predictions.withColumnRenamed("indexedLabel", label);

    // collect metrics
    Dataset<Row> pred = predictions.select("prediction", label);
    Map<String, String> metrics = new LinkedHashMap<>();
    metrics.put("Method", predictor.getClass().getSimpleName());

    if (classCount == 2) {
        BinaryClassificationMetrics b = new BinaryClassificationMetrics(pred);
        metrics.put("AUC", Float.toString((float) b.areaUnderROC()));
    }

    MulticlassMetrics m = new MulticlassMetrics(pred);
    metrics.put("F", Float.toString((float) m.weightedFMeasure()));
    metrics.put("Accuracy", Float.toString((float) m.accuracy()));
    metrics.put("Precision", Float.toString((float) m.weightedPrecision()));
    metrics.put("Recall", Float.toString((float) m.weightedRecall()));
    metrics.put("False Positive Rate", Float.toString((float) m.weightedFalsePositiveRate()));
    metrics.put("True Positive Rate", Float.toString((float) m.weightedTruePositiveRate()));
    metrics.put("", "\nConfusion Matrix\n" + Arrays.toString(labels) + "\n" + m.confusionMatrix().toString());

    return metrics;
}
Example 15
Source File: ProteinFoldDatasetCreator.java From mmtf-spark with Apache License 2.0 | 4 votes |
/**
 * @param args
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 1) {
        System.err.println("Usage: " + ProteinFoldDatasetCreator.class.getSimpleName() + " <dataset output file>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
        .setMaster("local[*]")
        .setAppName(ProteinFoldDatasetCreator.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset set (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
        .readSequenceFile(path, sc)
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new Pisces(sequenceIdentity, resolution));

    // get secondary structure content
    Dataset<Row> data = SecondaryStructureExtractor.getDataset(pdb);

    // classify chains by secondary structure type
    double minThreshold = 0.05;
    double maxThreshold = 0.15;
    data = addProteinFoldType(data, minThreshold, maxThreshold);

    // create a binary classification dataset
    data = data.filter("foldType = 'alpha' OR foldType = 'beta'").cache();

    // create a three-state classification model (alpha, beta, alpha+beta)
    // data = data.filter("foldType != 'other'").cache();

    // add Word2Vec encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int n = 2;
    int windowSize = 11;
    int vectorSize = 50;
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);

    data.printSchema();
    data.show(25);

    // keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType", "features");

    data.write().mode("overwrite").format("parquet").save(args[0]);

    long end = System.nanoTime();

    System.out.println((end - start) / 1E9 + " sec");
}
Example 16
Source File: UDFExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    // Build a Spark Session
    SparkSession sparkSession = SparkSession
        .builder()
        .master("local")
        .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
        .appName("EdgeBuilder")
        .getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // Read the CSV data
    Dataset<Row> emp_ds = sparkSession.read()
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load("src/main/resources/employee.txt");

    UDF2 calcDays = new CalcDaysUDF();
    // Register the UDF in the Spark Session created above
    sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);

    emp_ds.createOrReplaceTempView("emp_ds");

    emp_ds.printSchema();
    emp_ds.show();

    sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();

    // Instantiate UDAF
    AverageUDAF calcAvg = new AverageUDAF();
    // Register UDAF to SparkSession
    sparkSession.udf().register("calAvg", calcAvg);
    // Use UDAF
    sparkSession.sql("select deptno,calAvg(salary) from emp_ds group by deptno ").show();

    // Type-safe UDAF
    TypeSafeUDAF typeSafeUDAF = new TypeSafeUDAF();
    Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
    emf.printSchema();
    emf.show();

    TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
    Dataset<Double> result = emf.select(averageSalary);
    result.show();
}
Example 17
Source File: JavaEstimatorTransformerParamExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .master("local")
        .config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse")
        .appName("JavaEstimatorTransformerParamExample")
        .getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // $example on$
    // Prepare training data.
    List<Row> dataTraining = Arrays.asList(
        RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
        RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
        RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
        RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> training = spark.createDataFrame(dataTraining, schema);

    // Create a LogisticRegression instance. This instance is an Estimator.
    LogisticRegression lr = new LogisticRegression();
    // Print out the parameters, documentation, and any default values.
    System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");

    // We may set parameters using setter methods.
    lr.setMaxIter(10).setRegParam(0.01);

    // Learn a LogisticRegression model. This uses the parameters stored in lr.
    LogisticRegressionModel model1 = lr.fit(training);
    // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
    // we can view the parameters it used during fit().
    // This prints the parameter (name: value) pairs, where names are unique IDs for this
    // LogisticRegression instance.
    System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());

    // We may alternatively specify parameters using a ParamMap.
    ParamMap paramMap = new ParamMap()
        .put(lr.maxIter().w(20))  // Specify 1 Param.
        .put(lr.maxIter(), 30)  // This overwrites the original maxIter.
        .put(lr.regParam().w(0.1), lr.threshold().w(0.55));  // Specify multiple Params.

    // One can also combine ParamMaps.
    ParamMap paramMap2 = new ParamMap()
        .put(lr.probabilityCol().w("myProbability"));  // Change output column name
    ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);

    // Now learn a new model using the paramMapCombined parameters.
    // paramMapCombined overrides all parameters set earlier via lr.set* methods.
    LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
    System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());

    // Prepare test documents.
    List<Row> dataTest = Arrays.asList(
        RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
        RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
        RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
    );
    Dataset<Row> test = spark.createDataFrame(dataTest, schema);

    // Make predictions on test documents using the Transformer.transform() method.
    // LogisticRegression.transform will only use the 'features' column.
    // Note that model2.transform() outputs a 'myProbability' column instead of the usual
    // 'probability' column since we renamed the lr.probabilityCol parameter previously.
    Dataset<Row> results = model2.transform(test);
    Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
    for (Row r : rows.collectAsList()) {
        System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
            + ", prediction=" + r.get(3));
    }
    // $example off$

    spark.stop();
}
Example 18
Source File: BikeRentalPrediction.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) { System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); SparkSession sparkSession = SparkSession .builder() .master("local") .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse") .appName("BikeRentalPrediction").getOrCreate(); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); //We use the sqlContext.read method to read the data and set a few options: // 'format': specifies the Spark CSV data source // 'header': set to true to indicate that the first line of the CSV data file is a header // The file is called 'hour.csv'. Dataset<Row> ds=sparkSession.read() .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") .option("header", "true") .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv"); ds.cache(); ds.select("season").show();; ds.show(); System.out.println("Our dataset has rows :: "+ ds.count()); Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered"); df.printSchema(); //col("...") is preferable to df.col("...") Dataset<Row> dformatted = df.select(col("season").cast(DataTypes.IntegerType), col("yr").cast(DataTypes.IntegerType), col("mnth").cast(DataTypes.IntegerType), col("hr").cast(DataTypes.IntegerType), col("holiday").cast(DataTypes.IntegerType), col("weekday").cast(DataTypes.IntegerType), col("workingday").cast(DataTypes.IntegerType), col("weathersit").cast(DataTypes.IntegerType), col("temp").cast(DataTypes.IntegerType), col("atemp").cast(DataTypes.IntegerType), col("hum").cast(DataTypes.IntegerType), col("windspeed").cast(DataTypes.IntegerType), col("cnt").cast(DataTypes.IntegerType)); dformatted.printSchema(); Dataset<Row>[] data= dformatted.randomSplit(new double[]{0.7,0.3}); System.out.println("We have training examples count :: "+ data[0].count()+" and test examples count ::"+data[1].count()); /// //removing 'cnt' cloumn and then forming str array String[] featuresCols = dformatted.drop("cnt").columns(); for(String str:featuresCols){ System.out.println(str+" :: "); } //This concatenates all feature columns into a single feature vector in a new column "rawFeatures". VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures"); //This identifies categorical features and indexes them. VectorIndexer vectorIndexer= new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4); //Takes the "features" column and learns to predict "cnt" GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt"); // Define a grid of hyperparameters to test: // - maxDepth: max depth of each decision tree in the GBT ensemble // - maxIter: iterations, i.e., number of trees in each GBT ensemble // In this example notebook, we keep these values small. In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100). ParamMap[] paramGrid = new ParamGridBuilder().addGrid(gbt.maxDepth(),new int[]{2, 5}).addGrid(gbt.maxIter(),new int[] {10, 100}).build(); // We define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true labels with predictions. RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol(gbt.getLabelCol()).setPredictionCol(gbt.getPredictionCol()); // # Declare the CrossValidator, which runs model tuning for us. 
CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid); Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler,vectorIndexer,cv}); PipelineModel pipelineModel=pipeline.fit(data[0]); Dataset<Row> predictions = pipelineModel.transform(data[1]); predictions.show(); //predictions.select("cnt", "prediction", *featuresCols); }
Example 19
Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType RSVP_SCHEMA = new StructType()
        .add("event",
            new StructType()
                .add("event_id", StringType, true)
                .add("event_name", StringType, true)
                .add("event_url", StringType, true)
                .add("time", LongType, true))
        .add("group",
            new StructType()
                .add("group_city", StringType, true)
                .add("group_country", StringType, true)
                .add("group_id", LongType, true)
                .add("group_lat", DoubleType, true)
                .add("group_lon", DoubleType, true)
                .add("group_name", StringType, true)
                .add("group_state", StringType, true)
                .add("group_topics", DataTypes.createArrayType(
                    new StructType()
                        .add("topicName", StringType, true)
                        .add("urlkey", StringType, true)), true)
                .add("group_urlname", StringType, true))
        .add("guests", LongType, true)
        .add("member",
            new StructType()
                .add("member_id", LongType, true)
                .add("member_name", StringType, true)
                .add("photo", StringType, true))
        .add("mtime", LongType, true)
        .add("response", StringType, true)
        .add("rsvp_id", LongType, true)
        .add("venue",
            new StructType()
                .add("lat", DoubleType, true)
                .add("lon", DoubleType, true)
                .add("venue_id", LongType, true)
                .add("venue_name", StringType, true))
        .add("visibility", StringType, true);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession spark = SparkSession
        .builder()
        .config(conf)
        .getOrCreate();

    PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);

    Dataset<Row> meetupStream = spark.readStream()
        .format(KAFKA_FORMAT)
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", KAFKA_TOPIC)
        .load();

    Dataset<Row> gatheredDF = meetupStream.select(
            (from_json(col("value").cast("string"), RSVP_SCHEMA))
                .alias("rsvp"))
        .alias("meetup")
        .select("meetup.*");

    Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

    Dataset<Row> preparedDF = filteredDF.select(
        col("rsvp.group.group_city"),
        col("rsvp.group.group_lat"),
        col("rsvp.group.group_lon"),
        col("rsvp.response")
    );

    preparedDF.printSchema();

    Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);

    StreamingQuery query = predictionDF.writeStream()
        .format(JSON_FORMAT)
        .option("path", RESULT_FOLDER_PATH)
        .option("checkpointLocation", CHECKPOINT_LOCATION)
        .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
        .option("truncate", false)
        .start();

    query.awaitTermination();
}