org.apache.spark.ml.feature.Tokenizer Java Examples
The following examples show how to use
org.apache.spark.ml.feature.Tokenizer.
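Tokenizer is a plain Transformer (no fitting step) that lowercases a string column and splits it on whitespace into an array-of-strings column; Example #11 below mirrors exactly that behaviour when mapping it to PMML, and Example #7 shows RegexTokenizer for anything beyond whitespace splitting. As a quick orientation before the project-specific examples, here is a minimal sketch of that basic usage; the class name, column names and sample sentences are illustrative choices, not taken from any of the projects below:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MinimalTokenizerSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("MinimalTokenizerSketch")
            .master("local[*]")  // local master for a quick experiment; drop when submitting to a cluster
            .getOrCreate();

        List<Row> data = Arrays.asList(
            RowFactory.create(0, "Hi I heard about Spark"),
            RowFactory.create(1, "Logistic regression models are neat")
        );
        StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
        });
        Dataset<Row> sentences = spark.createDataFrame(data, schema);

        // Tokenizer is a Transformer: no fit() step, transform() adds the output column directly.
        Tokenizer tokenizer = new Tokenizer()
            .setInputCol("sentence")
            .setOutputCol("words");
        tokenizer.transform(sentences).select("sentence", "words").show(false);

        spark.stop();
    }
}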
Example #1
Source File: CMM.java From vn.vitk with GNU General Public License v3.0
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
private Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("featureStrings")
        .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer tagIndexer = new StringIndexer()
        .setInputCol("tag")
        .setOutputCol("label");
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
    return pipeline;
}
Example #2
Source File: TransitionClassifier.java From vn.vitk with GNU General Public License v3.0
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
protected Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer transitionIndexer = new StringIndexer()
        .setInputCol("transition")
        .setOutputCol("label");
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
    return pipeline;
}
Example #3
Source File: JavaTfIdfExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTfIdfExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0.0, "Hi I heard about Spark"),
        RowFactory.create(0.0, "I wish Java could use case classes"),
        RowFactory.create(1.0, "Logistic regression models are neat")
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
    Dataset<Row> wordsData = tokenizer.transform(sentenceData);

    int numFeatures = 20;
    HashingTF hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(numFeatures);

    Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    // alternatively, CountVectorizer can also be used to get term frequency vectors

    IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    IDFModel idfModel = idf.fit(featurizedData);

    Dataset<Row> rescaledData = idfModel.transform(featurizedData);
    rescaledData.select("label", "features").show();
    // $example off$

    spark.stop();
}
Example #4
Source File: TestSparkMLDeriver.java From envelope with Apache License 2.0
private void generateAndSaveModel(String savePath) throws IOException {
    // Sourced from the Spark ML documentation and examples

    StructType trainingSchema = DataTypes.createStructType(Lists.newArrayList(
        DataTypes.createStructField("id", DataTypes.LongType, false),
        DataTypes.createStructField("text", DataTypes.StringType, false),
        DataTypes.createStructField("label", DataTypes.DoubleType, false)
    ));
    Dataset<Row> training = Contexts.getSparkSession().createDataFrame(Lists.newArrayList(
        RowFactory.create(0L, "a b c d e spark", 1.0),
        RowFactory.create(1L, "b d", 0.0),
        RowFactory.create(2L, "spark f g h", 1.0),
        RowFactory.create(3L, "hadoop mapreduce", 0.0)
    ), trainingSchema);

    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("words");
    HashingTF hashingTF = new HashingTF()
        .setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.001);
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

    PipelineModel model = pipeline.fit(training);

    model.write().overwrite().save(savePath);
}
Example #5
Source File: JavaPipelineExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaPipelineExample")
        .getOrCreate();

    // $example on$
    // Prepare training documents, which are labeled.
    Dataset<Row> training = spark.createDataFrame(Arrays.asList(
        new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
        new JavaLabeledDocument(1L, "b d", 0.0),
        new JavaLabeledDocument(2L, "spark f g h", 1.0),
        new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0)
    ), JavaLabeledDocument.class);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("words");
    HashingTF hashingTF = new HashingTF()
        .setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.001);
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel model = pipeline.fit(training);

    // Prepare test documents, which are unlabeled.
    Dataset<Row> test = spark.createDataFrame(Arrays.asList(
        new JavaDocument(4L, "spark i j k"),
        new JavaDocument(5L, "l m n"),
        new JavaDocument(6L, "spark hadoop spark"),
        new JavaDocument(7L, "apache hadoop")
    ), JavaDocument.class);

    // Make predictions on test documents.
    Dataset<Row> predictions = model.transform(test);
    for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) {
        System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
            + ", prediction=" + r.get(3));
    }
    // $example off$

    spark.stop();
}
Example #6
Source File: JavaModelSelectionViaCrossValidationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaModelSelectionViaCrossValidationExample")
        .getOrCreate();

    // $example on$
    // Prepare training documents, which are labeled.
    Dataset<Row> training = spark.createDataFrame(Arrays.asList(
        new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
        new JavaLabeledDocument(1L, "b d", 0.0),
        new JavaLabeledDocument(2L, "spark f g h", 1.0),
        new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0),
        new JavaLabeledDocument(4L, "b spark who", 1.0),
        new JavaLabeledDocument(5L, "g d a y", 0.0),
        new JavaLabeledDocument(6L, "spark fly", 1.0),
        new JavaLabeledDocument(7L, "was mapreduce", 0.0),
        new JavaLabeledDocument(8L, "e spark program", 1.0),
        new JavaLabeledDocument(9L, "a e c l", 0.0),
        new JavaLabeledDocument(10L, "spark compile", 1.0),
        new JavaLabeledDocument(11L, "hadoop software", 0.0)
    ), JavaLabeledDocument.class);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("words");
    HashingTF hashingTF = new HashingTF()
        .setNumFeatures(1000)
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
        .setMaxIter(10)
        .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

    // We use a ParamGridBuilder to construct a grid of parameters to search over.
    // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
    // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
    ParamMap[] paramGrid = new ParamGridBuilder()
        .addGrid(hashingTF.numFeatures(), new int[] {10, 100, 1000})
        .addGrid(lr.regParam(), new double[] {0.1, 0.01})
        .build();

    // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
    // This will allow us to jointly choose parameters for all Pipeline stages.
    // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    // Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
    // is areaUnderROC.
    CrossValidator cv = new CrossValidator()
        .setEstimator(pipeline)
        .setEvaluator(new BinaryClassificationEvaluator())
        .setEstimatorParamMaps(paramGrid)
        .setNumFolds(2);  // Use 3+ in practice

    // Run cross-validation, and choose the best set of parameters.
    CrossValidatorModel cvModel = cv.fit(training);

    // Prepare test documents, which are unlabeled.
    Dataset<Row> test = spark.createDataFrame(Arrays.asList(
        new JavaDocument(4L, "spark i j k"),
        new JavaDocument(5L, "l m n"),
        new JavaDocument(6L, "mapreduce spark"),
        new JavaDocument(7L, "apache hadoop")
    ), JavaDocument.class);

    // Make predictions on test documents. cvModel uses the best model found (lrModel).
    Dataset<Row> predictions = cvModel.transform(test);
    for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) {
        System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
            + ", prediction=" + r.get(3));
    }
    // $example off$

    spark.stop();
}
Example #7
Source File: JavaTokenizerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTokenizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, "Hi I heard about Spark"),
        RowFactory.create(1, "I wish Java could use case classes"),
        RowFactory.create(2, "Logistic,regression,models,are,neat")
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);

    Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");

    RegexTokenizer regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);

    spark.udf().register("countTokens", new UDF1<WrappedArray, Integer>() {
        @Override
        public Integer call(WrappedArray words) {
            return words.size();
        }
    }, DataTypes.IntegerType);

    Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
    tokenized.select("sentence", "words")
        .withColumn("tokens", callUDF("countTokens", col("words")))
        .show(false);

    Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
    regexTokenized.select("sentence", "words")
        .withColumn("tokens", callUDF("countTokens", col("words")))
        .show(false);
    // $example off$

    spark.stop();
}
Example #8
Source File: TokenizerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
public TokenizerConverter(Tokenizer transformer){
    super(transformer);
}
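Example #8 shows only the constructor: TokenizerConverter wraps the fitted Tokenizer stage so jpmml-sparkml can translate it during a whole-pipeline conversion, and it is normally dispatched to automatically rather than instantiated by hand. The sketch below illustrates that outer call under the assumption of a recent jpmml-sparkml release where the entry point is org.jpmml.sparkml.PMMLBuilder (older releases exposed a different utility class); the wrapper class and method here are hypothetical scaffolding for illustration only.

import org.apache.spark.ml.PipelineModel;
import org.apache.spark.sql.types.StructType;
import org.dmg.pmml.PMML;
import org.jpmml.sparkml.PMMLBuilder;

public class PipelineToPmmlSketch {

    // schema: the schema of the training Dataset<Row>; pipelineModel: a fitted pipeline
    // whose stages include a Tokenizer (as in Examples #4 and #5 above).
    public static PMML toPmml(StructType schema, PipelineModel pipelineModel) {
        // PMMLBuilder walks the pipeline stages and hands each one to its registered
        // converter; the Tokenizer stage ends up in TokenizerConverter shown above.
        return new PMMLBuilder(schema, pipelineModel).build();
    }
}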
Example #9
Source File: WhitespaceClassifier.java From vn.vitk with GNU General Public License v3.0
/**
 * Trains a whitespace classifier model and saves the resulting pipeline model
 * to an external file.
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName
 * @param numFeatures
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
    List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
    int id = 0;
    for (String sentence : sentences) {
        sentence = sentence.trim();
        for (int j = 0; j < sentence.length(); j++) {
            char c = sentence.charAt(j);
            if (c == ' ' || c == '_') {
                WhitespaceContext context = new WhitespaceContext();
                context.setId(id++);
                context.setContext(extractContext(sentence, j));
                context.setLabel(c == ' ' ? 0d : 1d);
                contexts.add(context);
            }
        }
    }
    JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
    DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
    df.show(false);
    System.out.println("N = " + df.count());
    df.groupBy("label").count().show();

    org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
        .setInputCol("context").setOutputCol("words");
    HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
        .setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
    LogisticRegression lr = new LogisticRegression().setMaxIter(100)
        .setRegParam(0.01);
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { tokenizer, hashingTF, lr });
    model = pipeline.fit(df);

    try {
        model.write().overwrite().save(pipelineModelFileName);
    } catch (IOException e) {
        e.printStackTrace();
    }

    DataFrame predictions = model.transform(df);
    predictions.show();
    MulticlassClassificationEvaluator evaluator =
        new MulticlassClassificationEvaluator().setMetricName("precision");
    double accuracy = evaluator.evaluate(predictions);
    System.out.println("training accuracy = " + accuracy);

    LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
    LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
    double[] objectiveHistory = trainingSummary.objectiveHistory();
    System.out.println("#(iterations) = " + objectiveHistory.length);
    for (double lossPerIteration : objectiveHistory) {
        System.out.println(lossPerIteration);
    }
}
Example #10
Source File: HashingTFBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testHashingTF() {
    //prepare data
    List<Row> trainingData = Arrays.asList(
        RowFactory.create(1, 0.0, "Hi I heard about Spark"),
        RowFactory.create(2, 0.0, "I wish Java could use case classes"),
        RowFactory.create(3, 1.0, "Logistic regression models are neat")
    );
    StructType schema = new StructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("label", DoubleType, false),
        createStructField("sentence", StringType, false),
    });
    DataFrame sentenceData = sqlContext.createDataFrame(trainingData, schema);

    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("sentence")
        .setOutputCol("words");
    DataFrame wordsData = tokenizer.transform(sentenceData);

    //train model in spark
    int numFeatures = 20;
    HashingTF sparkModel = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(numFeatures);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, sentenceData);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(wordsData).orderBy("id")
        .select("id", "sentence", "words", "rawFeatures").collect();
    for (Row row : sparkOutput) {
        String[] words = ((String) row.get(1)).toLowerCase().split(" ");

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), words);
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(sparkModel.getOutputCol());

        double[] sparkOp = ((Vector) row.get(3)).toArray();
        assertArrayEquals(transformedOp, sparkOp, EPSILON);
    }
}
Example #11
Source File: TokenizerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
    Tokenizer transformer = getTransformer();

    Feature feature = encoder.getOnlyFeature(transformer.getInputCol());

    Apply apply = PMMLUtil.createApply(PMMLFunctions.LOWERCASE, feature.ref());

    DerivedField derivedField = encoder.createDerivedField(FeatureUtil.createName("lowercase", feature),
        OpType.CATEGORICAL, DataType.STRING, apply);

    return Collections.singletonList(new DocumentFeature(encoder, derivedField, "\\s+"));
}