org.apache.spark.sql.Row#getList

Source File: CommonAddressFeaturesBridgeTest.java From spark-transformers with Apache License 2.0

6 votes

/**
 * Verifies, row by row, that the transformer yields the same address features as the
 * collected Spark output.
 *
 * <p>Rows are read positionally: column 0 is supplied as "mergedAddress", column 1 (a list)
 * is supplied as "sanitizedAddress", and columns 2 through 7 are the expected values for the
 * six feature keys read back from the map after the transformer has run.
 *
 * @param rowDataset dataset providing both the inputs and the expected feature values
 * @param transformer transformer under test; its results are read from the map handed to it
 */
private void assertCorrectness(Dataset<Row> rowDataset, Transformer transformer) {
	// Failure messages and output keys, in the same order as row columns 2..7.
	String[] messages = {
			"number of words should be equals",
			"number of commas should be equals",
			"numericPresent should be equals",
			"addressLength should be equals",
			"favouredStart should be equals",
			"unfavouredStart should be equals"
	};
	String[] featureKeys = {
			"numWords", "numCommas", "numericPresent", "addressLength", "favouredStart", "unfavouredStart"
	};

	for (Row sparkRow : rowDataset.collectAsList()) {
		Object mergedAddress = sparkRow.get(0);

		// The transformer input is a String[], so copy the row's list element by element.
		List<Object> tokens = sparkRow.getList(1);
		String[] sanitizedAddress = new String[tokens.size()];
		int next = 0;
		for (Object token : tokens) {
			sanitizedAddress[next++] = (String) token;
		}

		Map<String, Object> input = new HashMap<>();
		input.put("mergedAddress", mergedAddress);
		input.put("sanitizedAddress", sanitizedAddress);
		transformer.transform(input);

		for (int k = 0; k < featureKeys.length; k++) {
			// Expected values start at column 2, right after the two input columns.
			assertEquals(messages[k], sparkRow.get(k + 2), input.get(featureKeys[k]));
		}
	}
}

Source File: PopularWordsEstimatorBridgeTest.java From spark-transformers with Apache License 2.0

6 votes

/**
 * Verifies, row by row, that the transformer's "commonFraction" matches the value in the
 * collected Spark output to within 0.01.
 *
 * <p>Column 0 of each row (a list) is supplied as "sanitizedAddress"; column 1 is the
 * expected fraction.
 *
 * @param rowDataset dataset providing both the inputs and the expected fraction
 * @param transformer transformer under test; its result is read from the map handed to it
 */
private void assertCorrectness(Dataset<Row> rowDataset, Transformer transformer) {
	for (Row sparkRow : rowDataset.collectAsList()) {
		// The transformer input is a String[], so copy the row's list element by element.
		List<Object> tokens = sparkRow.getList(0);
		String[] sanitizedAddress = new String[tokens.size()];
		int next = 0;
		for (Object token : tokens) {
			sanitizedAddress[next++] = (String) token;
		}

		Map<String, Object> input = new HashMap<>();
		input.put("sanitizedAddress", sanitizedAddress);

		double sparkFraction = sparkRow.getDouble(1);
		transformer.transform(input);
		double transformerFraction = (Double) input.get("commonFraction");

		assertEquals(sparkFraction, transformerFraction, 0.01);
	}
}

Source File: JavaWord2VecExample.java From SparkDemo with MIT License

5 votes

/**
 * Fits a Word2Vec model on three tokenized sentences and prints each sentence next to the
 * vector the fitted model produces for it.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaWord2VecExample").getOrCreate();

  // $example on$
  // Single non-nullable column "text" holding an array of (nullable) strings.
  StructField textField =
    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty());
  StructType schema = new StructType(new StructField[]{textField});

  // Input data: Each row is a bag of words from a sentence or document.
  List<Row> sentences = Arrays.asList(
    RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
    RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
    RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
  );
  Dataset<Row> documents = spark.createDataFrame(sentences, schema);

  // Learn a mapping from words to Vectors.
  Word2VecModel model = new Word2Vec()
    .setInputCol("text")
    .setOutputCol("result")
    .setVectorSize(3)
    .setMinCount(0)
    .fit(documents);

  List<Row> embedded = model.transform(documents).collectAsList();
  for (Row row : embedded) {
    List<String> words = row.getList(0);
    Vector vector = (Vector) row.get(1);
    System.out.println("Text: " + words + " => \nVector: " + vector + "\n");
  }
  // $example off$

  spark.stop();
}

Source File: TestProtobufTranslator.java From envelope with Apache License 2.0

5 votes

/**
 * Translates the SINGLE_REPEATING payload with a translator configured from the
 * SINGLE_EXAMPLE descriptor, then checks the first translated row: field 0 must be
 * "repeating message" and the list at field 18 must hold exactly one row whose field 0 is
 * "nested".
 */
@Test
public void translateSingleRepeating() throws Exception {
  // Both settings live under the translator's schema config prefix.
  String schemaPrefix = ProtobufTranslator.SCHEMA_CONFIG + ".";
  String descPath = TestProtobufTranslator.class.getResource(SINGLE_EXAMPLE).getPath();

  Map<String, Object> settings = new HashMap<>();
  settings.put(schemaPrefix + ComponentFactory.TYPE_CONFIG_NAME, "protobuf");
  settings.put(schemaPrefix + ProtobufSchema.DESCRIPTOR_FILEPATH_CONFIG, descPath);
  Config config = ConfigFactory.parseMap(settings);

  ProtobufTranslator translator = new ProtobufTranslator();
  assertNoValidationFailures(translator, config);
  translator.configure(config);

  Row raw = TestingMessageFactory.get(
      "foo".getBytes(), DataTypes.BinaryType,
      Files.readAllBytes(SINGLE_REPEATING.toPath()), DataTypes.BinaryType);
  Iterable<Row> results = translator.translate(raw);
  System.out.println("results = " + results);

  // A fresh iterator is requested for the emptiness check and again to fetch the first row.
  assertThat(results.iterator().hasNext(), is(true));
  Row first = results.iterator().next();
  assertThat(first.getString(0), is("repeating message"));

  List<Row> nestedRows = first.getList(18);
  assertThat(nestedRows.size(), is(1));
  assertThat(nestedRows.get(0).getString(0), is("nested"));
}

Source File: StringSanitizerBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

/**
 * Runs three raw address strings through the Spark StringSanitizer and through the
 * transformer obtained by exporting and re-importing that same model, and checks that both
 * produce the same token list for every row.
 */
@Test
public void testStringSanitizer() {

	// Fixture: (id, rawText) rows.
	List<Row> rawRows = Arrays.asList(
			RowFactory.create(1, "Jyoti complex near Sananda clothes store; English Bazar; Malda;WB;India,"),
			RowFactory.create(2, "hallalli vinayaka tent road c/o B K vishwanath Mandya"),
			RowFactory.create(3, "M.sathish S/o devudu Lakshmi opticals Gokavaram bus stand Rajhamundry 9494954476")
	);
	JavaRDD<Row> rdd = jsc.parallelize(rawRows);

	StructField idField = new StructField("id", DataTypes.IntegerType, false, Metadata.empty());
	StructField rawTextField = new StructField("rawText", DataTypes.StringType, false, Metadata.empty());
	StructType schema = new StructType(new StructField[]{idField, rawTextField});

	Dataset<Row> dataset = spark.createDataFrame(rdd, schema);
	dataset.show();

	// Spark-side sanitizer reading "rawText" and writing "token".
	StringSanitizer sparkModel = new StringSanitizer().setInputCol("rawText").setOutputCol("token");

	// Round-trip the model through export/import to obtain the transformer under test.
	Transformer transformer = ModelImporter.importAndGetTransformer(ModelExporter.export(sparkModel));

	Dataset<Row> sparkTokens = sparkModel.transform(dataset).select("rawText", "token");
	for (Row row : sparkTokens.collectAsList()) {
		Map<String, Object> input = new HashMap<String, Object>();
		input.put(sparkModel.getInputCol(), row.getString(0));
		transformer.transform(input);

		// The transformer's output is cast to String[]; wrap it so it can be compared as a list.
		List<String> produced = Arrays.asList((String[]) input.get(sparkModel.getOutputCol()));
		List<String> expected = row.getList(1);

		assertTrue("both should be same", produced.equals(expected));
	}
}

Source File: ValueSetUdfs.java From bunsen with Apache License 2.0

3 votes

/**
 * Tests whether the given CodeableConcept row has at least one Coding that belongs to the
 * ValueSet with the given reference name.
 *
 * @param codeableRow the CodeableConcept row; may be null
 * @param referenceName reference name of the ValueSet to check against
 * @param valueSets the value sets consulted through {@code hasCode}
 * @return true as soon as one coding's "system" and "code" match; false when the row is null,
 *     when the list at field index 1 is null, or when no coding matches
 */
private static Boolean inValueSet(Row codeableRow,
    String referenceName,
    BroadcastableValueSets valueSets) {

  if (codeableRow == null) {
    return false;
  }

  // The codings are read from field index 1 of the row.
  List<Row> codings = codeableRow.getList(1);
  if (codings == null) {
    return false;
  }

  for (Row coding : codings) {
    String system = coding.getAs("system");
    String code = coding.getAs("code");

    // Stop at the first matching code.
    if (valueSets.hasCode(referenceName, system, code)) {
      return true;
    }
  }

  return false;
}

Java Code Examples for org.apache.spark.sql.Row#getList()