org.apache.spark.mllib.linalg.VectorUDT Java Examples
The following examples show how to use org.apache.spark.mllib.linalg.VectorUDT.
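VectorUDT is the Spark SQL data type that lets a DataFrame column hold mllib Vector values, so it appears wherever a schema declares a vector-valued column. As a minimal orientation before the examples, here is an illustrative sketch (not taken from any of the projects below; it assumes an existing JavaSparkContext sc and SQLContext sqlContext, as the test examples do):

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// A schema with one double column and one vector column backed by VectorUDT.
StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), false, Metadata.empty())
});

// sc and sqlContext are assumed to exist already (hypothetical setup).
JavaRDD<Row> rows = sc.parallelize(Arrays.asList(
    RowFactory.create(1.0, Vectors.dense(0.5, 1.5)),
    RowFactory.create(0.0, Vectors.dense(2.0, 0.1))));

DataFrame df = sqlContext.createDataFrame(rows, schema);
df.printSchema();  // the features column is reported as a vector type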
Example #1
Source File: EntitySalienceFeatureExtractorSpark.java From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 *
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances = documents.flatMap(s -> {
        TOTAL_DOCS.add(1);
        return fe.getTrainingInstances(s.getJCas(),
            trainingSettings.getFeatureExtractor(),
            trainingSettings.getPositiveInstanceScalingFactor());
    });

    StructType schema = new StructType(new StructField[]{
        new StructField("docId", DataTypes.StringType, false, Metadata.empty()),
        new StructField("entityId", DataTypes.StringType, false, Metadata.empty()),
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
Example #2
Source File: FirstPrediction.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("First Prediction")
        .master("local").getOrCreate();

    StructType schema = new StructType(new StructField[] {
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    // TODO this example is not working yet
}
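The TODO above was never filled in. One likely reason this schema stalls under a SparkSession is that Spark 2.x ML estimators expect vectors from org.apache.spark.ml.linalg rather than org.apache.spark.mllib.linalg. A hypothetical continuation of start() along those lines (the rows, feature values, and the choice of LogisticRegression are illustrative, not from the original project) could look like:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.linalg.SQLDataTypes;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical continuation of start(): a tiny made-up dataset, fit, predict.
List<Row> rows = Arrays.asList(
    RowFactory.create(1.0, Vectors.dense(1.0, 2.0)),
    RowFactory.create(0.0, Vectors.dense(-1.0, -2.0)));

// Spark 2.x ML expects org.apache.spark.ml.linalg vectors;
// SQLDataTypes.VectorType() is the public handle to the corresponding UDT.
StructType mlSchema = new StructType(new StructField[] {
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", SQLDataTypes.VectorType(), false, Metadata.empty())
});

Dataset<Row> training = spark.createDataFrame(rows, mlSchema);

LogisticRegressionModel model = new LogisticRegression().fit(training);
model.transform(training).show();  // adds rawPrediction, probability, prediction columns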
Example #3
Source File: SchemaExporter.java From spark-transformers with Apache License 2.0
public static String exportToJson(Set<String> columns, StructType dfSchema) {
    // The exported schema lists each selected column's name along with its mapped type.
    List<Field> schema = new ArrayList<>();

    for (String column : columns) {
        StructField field = dfSchema.fields()[dfSchema.fieldIndex(column)];
        if (field.dataType() instanceof StringType) {
            schema.add(new Field(field.name(), STRING));
        } else if (field.dataType() instanceof BooleanType) {
            schema.add(new Field(field.name(), BOOLEAN));
        } else if (field.dataType() instanceof VectorUDT) {
            schema.add(new Field(field.name(), DOUBLE_ARRAY));
        } else if (field.dataType() instanceof DoubleType
                || field.dataType() instanceof DecimalType
                || field.dataType() instanceof FloatType
                || field.dataType() instanceof IntegerType
                || field.dataType() instanceof LongType
                || field.dataType() instanceof ShortType) {
            schema.add(new Field(field.name(), DOUBLE));
        } else if (field.dataType() instanceof ArrayType) {
            if (((ArrayType) field.dataType()).elementType() instanceof StringType) {
                schema.add(new Field(field.name(), STRING_ARRAY));
            } else if (((ArrayType) field.dataType()).elementType() instanceof DoubleType) {
                schema.add(new Field(field.name(), DOUBLE_ARRAY));
            } else {
                throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
            }
        } else {
            throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
        }
    }
    return gson.toJson(schema);
}
Example #4
Source File: SchemaExporter.java From spark-transformers with Apache License 2.0
public static String exportSchemaToJson(StructType dfSchema) {
    // The exported schema lists each column's name along with its mapped type.
    List<Field> schema = new ArrayList<>();

    for (StructField field : dfSchema.fields()) {
        if (field.dataType() instanceof StringType) {
            schema.add(new Field(field.name(), STRING));
        } else if (field.dataType() instanceof BooleanType) {
            schema.add(new Field(field.name(), BOOLEAN));
        } else if (field.dataType() instanceof VectorUDT) {
            schema.add(new Field(field.name(), DOUBLE_ARRAY));
        } else if (field.dataType() instanceof DoubleType
                || field.dataType() instanceof DecimalType
                || field.dataType() instanceof FloatType
                || field.dataType() instanceof IntegerType
                || field.dataType() instanceof LongType
                || field.dataType() instanceof ShortType) {
            schema.add(new Field(field.name(), DOUBLE));
        } else if (field.dataType() instanceof ArrayType) {
            if (((ArrayType) field.dataType()).elementType() instanceof StringType) {
                schema.add(new Field(field.name(), STRING_ARRAY));
            } else if (((ArrayType) field.dataType()).elementType() instanceof DoubleType) {
                schema.add(new Field(field.name(), DOUBLE_ARRAY));
            } else {
                throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
            }
        } else {
            throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
        }
    }
    return gson.toJson(schema);
}
Example #5
Source File: VectorBinarizerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testVectorBinarizerDense() {
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
        RowFactory.create(0d, 1d, new DenseVector(new double[]{-2d, -3d, -4d, -1d, 6d, -7d, 8d, 0d, 0d, 0d, 0d, 0d})),
        RowFactory.create(1d, 2d, new DenseVector(new double[]{4d, -5d, 6d, 7d, -8d, 9d, -10d, 0d, 0d, 0d, 0d, 0d})),
        RowFactory.create(2d, 3d, new DenseVector(new double[]{-5d, 6d, -8d, 9d, 10d, 11d, 12d, 0d, 0d, 0d, 0d, 0d}))
    ));

    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    VectorBinarizer vectorBinarizer = new VectorBinarizer()
        .setInputCol("vector1")
        .setOutputCol("binarized")
        .setThreshold(2d);

    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorBinarizer, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = vectorBinarizer.transform(df).orderBy("id").select("id", "value1", "vector1", "binarized").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(vectorBinarizer.getInputCol(), ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);

        double[] output = (double[]) data.get(vectorBinarizer.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
Example #6
Source File: SchemaExporterTest.java From spark-transformers with Apache License 2.0
/**
 * Output : [{"name":"id","datatype":"double"},{"name":"label","datatype":"double"},{"name":"features","datatype":"double []"}]
 */
@Test
public void testSchema1() {
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportSchemaToJson(schema));
}
Example #7
Source File: SchemaExporterTest.java From spark-transformers with Apache License 2.0
/**
 * Output : [{"name":"id","datatype":"double"},{"name":"value1","datatype":"double"},{"name":"vector1","datatype":"double []"}]
 */
@Test
public void testSchema3() {
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportSchemaToJson(schema));
}
Example #8
Source File: SchemaExporterTest.java From spark-transformers with Apache License 2.0
/**
 * Output : [{"name":"features","datatype":"double []"},{"name":"id","datatype":"double"}]
 */
@Test
public void testColumnExport1() {
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportToJson(new HashSet<String>(Arrays.asList("id", "features")), schema));
}
Example #9
Source File: SchemaExporterTest.java From spark-transformers with Apache License 2.0
/**
 * Output : [{"name":"id","datatype":"double"},{"name":"vector1","datatype":"double []"}]
 */
@Test
public void testColumnExport3() {
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportToJson(new HashSet<String>(Arrays.asList("id", "vector1")), schema));
}
Example #10
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 *
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae = EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances = documents
        .map(s -> {
            TOTAL_DOCS.add(1);
            Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
            String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
            tmpLogger.info("Processing document {}.", docId);

            // Before processing the document through the Disambiguation Pipeline,
            // add the AIDA settings in each document.
            SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                trainingSettings.getDocumentCoherent(),
                trainingSettings.getDocumentConfidenceThreshold());
            return ae.process(s);
        })
        .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
            trainingSettings.getFeatureExtractor(),
            trainingSettings.getPositiveInstanceScalingFactor()));

    StructType schema = new StructType(new StructField[]{
        new StructField("docId", DataTypes.StringType, false, Metadata.empty()),
        new StructField("entity", DataTypes.StringType, false, Metadata.empty()),
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
Example #11
Source File: VectorBinarizerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testVectorBinarizerSparse() {
    // prepare data
    int[] sparseArray1 = {5, 6, 11, 4, 7, 9, 8, 14, 13};
    double[] sparseArray1Values = {-5d, 7d, 1d, -2d, -4d, -1d, 31d, -1d, -3d};

    int[] sparseArray2 = {2, 6, 1};
    double[] sparseArray2Values = {1d, 11d, 2d};

    int[] sparseArray3 = {4, 6, 1};
    double[] sparseArray3Values = {52d, 71d, 11d};

    int[] sparseArray4 = {4, 1, 2};
    double[] sparseArray4Values = {17d, 7d, 9d};

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
        RowFactory.create(3d, 4d, new SparseVector(20, sparseArray1, sparseArray1Values)),
        RowFactory.create(4d, 5d, new SparseVector(20, sparseArray2, sparseArray2Values)),
        RowFactory.create(5d, 5d, new SparseVector(20, sparseArray3, sparseArray3Values)),
        RowFactory.create(6d, 5d, new SparseVector(20, sparseArray4, sparseArray4Values))
    ));

    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    VectorBinarizer vectorBinarizer = new VectorBinarizer()
        .setInputCol("vector1")
        .setOutputCol("binarized");

    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorBinarizer, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = vectorBinarizer.transform(df).orderBy("id").select("id", "value1", "vector1", "binarized").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(vectorBinarizer.getInputCol(), ((SparseVector) row.get(2)).toArray());
        transformer.transform(data);

        double[] output = (double[]) data.get(vectorBinarizer.getOutputCol());
        assertArrayEquals(output, ((SparseVector) row.get(3)).toArray(), 0d);
    }
}
Example #12
Source File: VectorAssemblerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testVectorAssembler() {
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
        RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
        RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
        RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
        RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
        RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    VectorAssembler vectorAssembler = new VectorAssembler()
        .setInputCols(new String[]{"value1", "vector1"})
        .setOutputCol("feature");

    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler, null);
    String exportedModelJson = new String(exportedModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);

        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
Example #13
Source File: ChiSqSelectorBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testChiSqSelector() {
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
        RowFactory.create(0d, 0d, new DenseVector(new double[]{8d, 7d, 0d})),
        RowFactory.create(1d, 1d, new DenseVector(new double[]{0d, 9d, 6d})),
        RowFactory.create(2d, 1d, new DenseVector(new double[]{0.0d, 9.0d, 8.0d})),
        RowFactory.create(3d, 2d, new DenseVector(new double[]{8.0d, 9.0d, 5.0d}))
    ));

    double[] preFilteredData = {0.0d, 6.0d, 8.0d, 5.0d};

    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    ChiSqSelector chiSqSelector = new ChiSqSelector();
    chiSqSelector.setNumTopFeatures(1);
    chiSqSelector.setFeaturesCol("features");
    chiSqSelector.setLabelCol("label");
    chiSqSelector.setOutputCol("output");

    ChiSqSelectorModel chiSqSelectorModel = chiSqSelector.fit(df);

    //Export this model
    byte[] exportedModel = ModelExporter.export(chiSqSelectorModel, null);
    String exportedModelJson = new String(exportedModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = chiSqSelectorModel.transform(df).orderBy("id").select("id", "label", "features", "output").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(chiSqSelectorModel.getFeaturesCol(), ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);

        double[] output = (double[]) data.get(chiSqSelectorModel.getOutputCol());
        System.out.println(Arrays.toString(output));
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
Example #14
Source File: ProbabilityColumnProducer.java From jpmml-evaluator-spark with GNU Affero General Public License v3.0
@Override
public StructField init(Evaluator evaluator) {
    return DataTypes.createStructField(getColumnName(), new VectorUDT(), false);
}