org.apache.spark.sql.types.DataTypes Java Examples
The following examples show how to use
org.apache.spark.sql.types.DataTypes.
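Most of the examples below follow the same pattern: reference singleton type constants such as DataTypes.StringType, build parameterized types and fields with the createXxx factory methods, and assemble a StructType schema. As a minimal, self-contained sketch of that core pattern (the class and field names here are illustrative, not taken from any of the projects below):

import java.util.Arrays;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class DataTypesSketch {
    public static void main(String[] args) {
        // Singleton types are plain static constants.
        DataType name = DataTypes.StringType;

        // Parameterized types come from factory methods.
        DataType tags = DataTypes.createArrayType(DataTypes.StringType); // array<string>
        DataType price = DataTypes.createDecimalType(10, 2);             // decimal(10,2)

        // A schema is a StructType built from (name, type, nullable) fields.
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("id", DataTypes.LongType, false),
                DataTypes.createStructField("name", name, true),
                DataTypes.createStructField("tags", tags, true),
                DataTypes.createStructField("price", price, true)));

        System.out.println(schema.treeString());
    }
}

A schema built this way is typically passed to SparkSession.createDataFrame(rows, schema), which is how most of the examples below use it.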
Example #1
Source File: MLContextUtil.java From systemds with Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
    StructType schema = df.schema();
    StructField[] fields = schema.fields();
    if (fields == null) {
        return true;
    }
    for (StructField field : fields) {
        DataType dataType = field.dataType();
        if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
                && (dataType != DataTypes.LongType)
                && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
                && (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
            // uncomment if we support arrays of doubles for matrices
            // if (dataType instanceof ArrayType) {
            //     ArrayType arrayType = (ArrayType) dataType;
            //     if (arrayType.elementType() == DataTypes.DoubleType) {
            //         continue;
            //     }
            // }
            return false;
        }
    }
    return true;
}
Example #2
Source File: MLContextTest.java From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

    List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #3
Source File: DataFrames.java From deeplearning4j with Apache License 2.0
/**
 * Convert a datavec schema to a
 * struct type in spark
 *
 * @param schema the schema to convert
 * @return the spark struct type
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] = new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                        "This api should not be used with strings, binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
Example #4
Source File: MLContextTest.java From systemds with Apache License 2.0
@Test
public void testGetTuple1DML() {
    System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
    JavaRDD<String> javaRddString = sc
            .parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("N=M*2").in("M", df).out("N");
    Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
    double[][] n = tuple._1().to2DDoubleArray();
    Assert.assertEquals(2.0, n[0][0], 0);
    Assert.assertEquals(4.0, n[0][1], 0);
    Assert.assertEquals(6.0, n[0][2], 0);
    Assert.assertEquals(8.0, n[1][0], 0);
    Assert.assertEquals(10.0, n[1][1], 0);
    Assert.assertEquals(12.0, n[1][2], 0);
    Assert.assertEquals(14.0, n[2][0], 0);
    Assert.assertEquals(16.0, n[2][1], 0);
    Assert.assertEquals(18.0, n[2][2], 0);
}
Example #5
Source File: NGlobalDictionaryV2Test.java From kylin-on-parquet-v2 with Apache License 2.0
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();
    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
Example #6
Source File: InstanceRelationWriter.java From rdf2x with Apache License 2.0
private DataType getDataType(int type) {
    switch (type) {
        case LiteralType.BOOLEAN:
            return DataTypes.BooleanType;
        case LiteralType.STRING:
            return DataTypes.StringType;
        case LiteralType.FLOAT:
            return DataTypes.FloatType;
        case LiteralType.DOUBLE:
            return DataTypes.DoubleType;
        case LiteralType.INTEGER:
            return DataTypes.IntegerType;
        case LiteralType.LONG:
            return DataTypes.LongType;
        case LiteralType.DATETIME:
            // datetime not supported due to timezone issues with java.sql.Timestamp
            // check the InstanceAggregator for more info
            return DataTypes.StringType;
    }
    throw new NotImplementedException("Not able to write literal type " + type);
}
Example #7
Source File: TestRowUtils.java From envelope with Apache License 2.0
@Test
public void testRemoveOneField() {
    StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
    StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
    StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
    StructType removeSchema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));
    Row remove = new RowWithSchema(removeSchema, "hello", 1, 1.0);
    Row removed = RowUtils.remove(remove, "field2");

    Row expected = new RowWithSchema(
            DataTypes.createStructType(Lists.newArrayList(field1, field3)), "hello", 1.0);

    assertEquals(expected, removed);
}
Example #8
Source File: TestSuite.java From stocator with Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
        throws Exception {
    System.out.println("*********************************");
    System.out.println("T16: Non overwrite mode " + containerOut);
    String o1 = containerOut + "myData/123";
    StructType schema = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("NAME", DataTypes.StringType, false),
            DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
            DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
    Row r1 = RowFactory.create("name1", "value1", 1);
    Row r2 = RowFactory.create("name2", "value2", 2);
    List<Row> rowList = ImmutableList.of(r1, r2);
    Dataset<Row> rows = spark.createDataFrame(rowList, schema);
    try {
        if (type.equals(Constants.PARQUET_TYPE)) {
            rows.write().mode(SaveMode.Overwrite).parquet(o1);
        } else if (type.equals(Constants.JSON_TYPE)) {
            rows.write().mode(SaveMode.Overwrite).json(o1);
        }
    } catch (Exception e) {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
        throw e;
    } finally {
        deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    }
}
Example #9
Source File: SimpleFeatureMapper.java From geowave with Apache License 2.0
@Override
public Row call(final SimpleFeature feature) throws Exception {
    final Object[] fields = new Serializable[schema.size()];

    for (int i = 0; i < schema.size(); i++) {
        final Object fieldObj = feature.getAttribute(i);
        if (fieldObj != null) {
            final StructField structField = schema.apply(i);
            if (structField.name().equals("geom")) {
                fields[i] = fieldObj;
            } else if (structField.dataType() == DataTypes.TimestampType) {
                fields[i] = new Timestamp(((Date) fieldObj).getTime());
            } else if (structField.dataType() != null) {
                fields[i] = fieldObj;
            } else {
                LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
            }
        }
    }

    return new GenericRowWithSchema(fields, schema);
}
Example #10
Source File: TestMorphlineTranslator.java From envelope with Apache License 2.0
@Test (expected = MorphlineCompilationException.class)
public void invalidCommand() throws Exception {
    Map<String, Object> configMap = Maps.newHashMap();
    configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
    configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
    configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
    configMap.put(MorphlineTranslator.MORPHLINE_ID, "invalid-command");
    configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
    configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
        Lists.newArrayList("int", "str", "float"));
    configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
        Lists.newArrayList("int", "string", "float"));
    Config config = ConfigFactory.parseMap(configMap);

    translator.configure(config);
    Row raw = TestingMessageFactory.get("The Key", DataTypes.StringType, "The Message", DataTypes.StringType);
    translator.translate(raw);
}
Example #11
Source File: SchemaConverter.java From geowave with Apache License 2.0
public static StructType schemaFromFeatureType(final SimpleFeatureType featureType) {
    final List<StructField> fields = new ArrayList<>();

    for (final AttributeDescriptor attrDesc : featureType.getAttributeDescriptors()) {
        final SimpleFeatureDataType sfDataType = attrDescToDataType(attrDesc);

        final String fieldName = (sfDataType.isGeom() ? "geom" : attrDesc.getName().getLocalPart());

        final StructField field = DataTypes.createStructField(fieldName, sfDataType.getDataType(), true);

        fields.add(field);
    }

    if (fields.isEmpty()) {
        LOGGER.error("Feature type produced empty dataframe schema!");
        return null;
    }

    return DataTypes.createStructType(fields);
}
Example #12
Source File: TestEventTimeUpsertPlanner.java From envelope with Apache License 2.0
@Before
public void before() {
    arriving = Lists.newArrayList();
    existing = Lists.newArrayList();

    keySchema = DataTypes.createStructType(Lists.newArrayList(
        DataTypes.createStructField("key", DataTypes.StringType, false)));
    recordSchema = DataTypes.createStructType(Lists.newArrayList(
        DataTypes.createStructField("key", DataTypes.StringType, false),
        DataTypes.createStructField("value", DataTypes.StringType, true),
        DataTypes.createStructField("timestamp", DataTypes.LongType, true)));

    configMap = Maps.newHashMap();
    configMap.put(EventTimeUpsertPlanner.KEY_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("key"));
    configMap.put(EventTimeUpsertPlanner.VALUE_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("value"));
    configMap.put(EventTimeUpsertPlanner.TIMESTAMP_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("timestamp"));
    config = ConfigFactory.parseMap(configMap);
}
Example #13
Source File: TestAvroUtils.java From envelope with Apache License 2.0
@Test
public void toTypeSchemaStructTypeFieldNullable() throws Exception {
    Schema schema = AvroUtils.typeFor(DataTypes.createStructType(
        Lists.newArrayList(
            DataTypes.createStructField("field1", DataTypes.StringType, true)
        )), false);

    assertEquals("Invalid type", Schema.Type.RECORD, schema.getType());
    assertEquals("Invalid record name", "record0", schema.getName());
    assertEquals("Invalid field count", 1, schema.getFields().size());
    assertEquals("Invalid field name", "field1", schema.getFields().get(0).name());
    assertEquals("Invalid field type", Schema.Type.UNION, schema.getFields().get(0).schema().getType());
    for (Schema s : schema.getFields().get(0).schema().getTypes()) {
        assertThat("Invalid union types", s.getType(), anyOf(is(Schema.Type.STRING), is(Schema.Type.NULL)));
    }

    //System.out.println(schema.toString(true));
}
Example #14
Source File: TestFlatteningTransformer.java From hudi with Apache License 2.0
@Test
public void testFlatten() {
    FlatteningTransformer transformer = new FlatteningTransformer();

    // Init
    StructField[] nestedStructFields =
        new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty())};

    StructField[] structFields =
        new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()),
            new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())};

    StructType schema = new StructType(structFields);
    String flattenedSql = transformer.flattenSchema(schema, null);

    assertEquals("intColumn as intColumn,stringColumn as stringColumn,"
        + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn,"
        + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql);
}
Example #15
Source File: MLContextTest.java From systemds with Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #16
Source File: MLContextTest.java From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column");

    List<String> list = new ArrayList<>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_DOUBLES);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 450.0");
    ml.execute(script);
}
Example #17
Source File: AtomInteraction.java From mmtf-spark with Apache License 2.0
/**
 * Returns the schema for a row of pairwise atom interactions.
 * The schema is used to create a Dataset<Row> from the row information.
 *
 * @return schema for dataset
 */
public static StructType getPairInteractionSchema() {
    int length = InteractionCenter.getLength();
    StructField[] sf = new StructField[2 * length + 2];
    int index = 0;
    sf[index++] = DataTypes.createStructField("pdbId", DataTypes.StringType, false);

    // copy schema info for query atom
    System.arraycopy(InteractionCenter.getStructFields(0), 0, sf, index, length);
    index += length;

    // copy schema info for interacting atoms and their distance
    System.arraycopy(InteractionCenter.getStructFields(1), 0, sf, index, length);
    index += length;
    sf[index++] = DataTypes.createStructField("distance1", DataTypes.FloatType, true);

    return new StructType(sf);
}
Example #18
Source File: TestProtobufTranslator.java From envelope with Apache License 2.0
@Test
public void translateMultiple() throws Exception {
    String descPath = TestProtobufTranslator.class.getResource(MULTIPLE_EXAMPLE).getPath();

    Map<String, Object> configMap = new HashMap<>();
    configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "protobuf");
    configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + ProtobufSchema.DESCRIPTOR_FILEPATH_CONFIG, descPath);
    configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + ProtobufSchema.DESCRIPTOR_MESSAGE_CONFIG, "OtherExample");
    Config config = ConfigFactory.parseMap(configMap);

    ProtobufTranslator translator = new ProtobufTranslator();
    assertNoValidationFailures(translator, config);
    translator.configure(config);

    byte[] key = "foo".getBytes();
    byte[] payload = Files.readAllBytes(MULTIPLE_UNCOMPRESSED.toPath());
    Row raw = TestingMessageFactory.get(key, DataTypes.BinaryType, payload, DataTypes.BinaryType);

    Iterable<Row> results = translator.translate(raw);
    assertThat(results.iterator().hasNext(), is(true));

    Row row = results.iterator().next();
    assertThat(row.getString(0), is("other"));
}
Example #19
Source File: TypeCastStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private DataType mapDataType(List<StructField> datasetFields, String column, String typeConfig) {
    DataType currentDatatype = getCurrentDataType(datasetFields, column);

    // when typeConfig is null (no config for this column), return the current DataType
    if (typeConfig == null) {
        return currentDatatype;
    }

    switch (typeConfig) {
        case "integer":
            return DataTypes.IntegerType;
        case "long":
            return DataTypes.LongType;
        case "double":
            return DataTypes.DoubleType;
        case "boolean":
            return DataTypes.BooleanType;
        case "date":
            return DataTypes.DateType;
        case "timestamp":
            return DataTypes.TimestampType;
        default:
            return DataTypes.StringType;
    }
}
Example #20
Source File: TestRowUtils.java From envelope with Apache License 2.0
@Test
public void testToRowValueDate() {
    DataType field = DataTypes.DateType;

    DateTime dateObj = DateTime.parse("2017-01-01T00:00:00"); // Pass-thru the TZ
    Date sqlDate = new Date(dateObj.getMillis());

    assertEquals("Invalid Long", sqlDate, RowUtils.toRowValue(dateObj.getMillis(), field));
    assertEquals("Invalid String", sqlDate, RowUtils.toRowValue("2017-001", field)); // ISO Date format
    assertEquals("Invalid Date", sqlDate, RowUtils.toRowValue(dateObj.toDate(), field));
    assertEquals("Invalid DateTime", sqlDate, RowUtils.toRowValue(dateObj, field));

    thrown.expect(RuntimeException.class);
    thrown.expectMessage(CoreMatchers.containsString("Invalid or unrecognized input format"));
    RowUtils.toRowValue(123, field);
}
Example #21
Source File: TestConfigurationDataTypes.java From envelope with Apache License 2.0
@Test
public void testGetSparkDataTypeValid() {
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DECIMAL), new DecimalType());
    assertEquals(ConfigurationDataTypes.getSparkDataType("decimal(38,38)"), new DecimalType(38,38));
    assertEquals(ConfigurationDataTypes.getSparkDataType("decimal ( 38 , 38 ) "), new DecimalType(38,38));
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.STRING), DataTypes.StringType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.FLOAT), DataTypes.FloatType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DOUBLE), DataTypes.DoubleType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BYTE), DataTypes.ByteType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.SHORT), DataTypes.ShortType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.INT), DataTypes.IntegerType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.LONG), DataTypes.LongType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BOOLEAN), DataTypes.BooleanType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BINARY), DataTypes.BinaryType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DATE), DataTypes.DateType);
    assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.TIMESTAMP), DataTypes.TimestampType);
}
Example #22
Source File: MLContextTest.java From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #23
Source File: TestDelimitedSerializer.java From envelope with Apache License 2.0
@Test
public void testDelimitedSerialization() {
    List<StructField> fields = Lists.newArrayList(
        DataTypes.createStructField("field1", DataTypes.StringType, true),
        DataTypes.createStructField("field2", DataTypes.IntegerType, true),
        DataTypes.createStructField("field3", DataTypes.BooleanType, true)
    );
    Row row = new RowWithSchema(DataTypes.createStructType(fields), "hello", 1, false);

    Map<String, String> configs = Maps.newHashMap();
    configs.put(DelimitedSerializer.FIELD_DELIMITER_CONFIG_NAME, "||");
    Serializer<Row> serializer = new DelimitedSerializer();
    serializer.configure(configs, false);

    byte[] serialized = serializer.serialize("test", row);
    serializer.close();

    assertEquals(new String(serialized), "hello||1||false");
}
Example #24
Source File: TestMorphlineTranslator.java From envelope with Apache License 2.0
@Test (expected = MorphlineRuntimeException.class)
public void noRecordReturned() throws Exception {
    Map<String, Object> configMap = Maps.newHashMap();
    configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
    configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
    configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
    configMap.put(MorphlineTranslator.MORPHLINE_ID, "no-return");
    configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
    configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
        Lists.newArrayList("int", "str", "float"));
    configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
        Lists.newArrayList("int", "string", "float"));
    Config config = ConfigFactory.parseMap(configMap);

    translator.configure(config);
    Row raw = TestingMessageFactory.get("The Key", DataTypes.StringType, "The Message", DataTypes.StringType);
    translator.translate(raw);
}
Example #25
Source File: TestAvroUtils.java From envelope with Apache License 2.0
@Test
public void toTypeSchemaStringNotNullable() throws Exception {
    Schema schema = AvroUtils.typeFor(DataTypes.StringType, false);

    assertEquals("Invalid type", Schema.Type.STRING, schema.getType());

    //System.out.println(schema.toString(true));
}
Example #26
Source File: TestDelimitedTranslator.java From envelope with Apache License 2.0
@Test
public void testNullMissing() {
    String delimited = "val1 2 34";
    Config config = ConfigFactory.empty()
        .withValue(DelimitedTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME,
            ConfigValueFactory.fromAnyRef("flat"))
        .withValue(DelimitedTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
            ConfigValueFactory.fromIterable(
                Lists.newArrayList("field1", "field2", "field3", "field4", "field5")))
        .withValue(DelimitedTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
            ConfigValueFactory.fromIterable(
                Lists.newArrayList("string", "int", "long", "int", "boolean")))
        .withValue(DelimitedTranslator.DELIMITER_CONFIG_NAME, ConfigValueFactory.fromAnyRef(" "));

    DelimitedTranslator t = new DelimitedTranslator();
    assertNoValidationFailures(t, config);
    t.configure(config);

    Row raw = TestingMessageFactory.get("testkey", DataTypes.StringType, delimited, DataTypes.StringType);
    Row r = t.translate(raw).iterator().next();

    assertEquals(r.length(), 5);
    assertEquals(r.get(0), "val1");
    assertEquals(r.get(1), 2);
    assertEquals(r.get(2), 34L);
    assertEquals(r.get(3), null);
    assertEquals(r.get(4), null);
}
Example #27
Source File: TestAvroUtils.java From envelope with Apache License 2.0
@Test
public void toTypeSchemaStringNullable() throws Exception {
    Schema schema = AvroUtils.typeFor(DataTypes.StringType);

    assertEquals("Invalid type", Schema.Type.UNION, schema.getType());
    assertEquals("Invalid union size", 2, schema.getTypes().size());
    for (Schema s : schema.getTypes()) {
        assertThat("Invalid union types", s.getType(), anyOf(is(Schema.Type.STRING), is(Schema.Type.NULL)));
    }

    //System.out.println(schema.toString(true));
}
Example #28
Source File: TestRowUtils.java From envelope with Apache License 2.0
@Test
public void testToRowValueTimestamp() {
    DataType field = DataTypes.TimestampType;

    DateTime dateObj = DateTime.parse("2017-01-01T00:00:00"); // Pass-thru the TZ
    Timestamp sqlTimestamp = new Timestamp(dateObj.getMillis());

    assertEquals("Invalid Long", sqlTimestamp, RowUtils.toRowValue(dateObj.getMillis(), field));
    assertEquals("Invalid String", sqlTimestamp, RowUtils.toRowValue("2017-001", field)); // ISO Date format
    assertEquals("Invalid Date", sqlTimestamp, RowUtils.toRowValue(dateObj.toDate(), field));
    assertEquals("Invalid DateTime", sqlTimestamp, RowUtils.toRowValue(dateObj, field));

    // Test custom timestamp format parsing
    Map<RowUtils.RowValueMetadata, Object> metadataNull = Maps.newHashMap();
    Map<RowUtils.RowValueMetadata, Object> metadataEmpty = Maps.newHashMap();
    Map<RowUtils.RowValueMetadata, Object> metadataFormat = Maps.newHashMap();
    Set<String> empty = Sets.newHashSet();
    Set<String> formats = Sets.newHashSet();
    formats.add("yyyy-MM-dd HH:mm:ss.SSSSS");
    metadataNull.put(RowUtils.RowValueMetadata.TIMESTAMP_FORMATS, null);
    metadataEmpty.put(RowUtils.RowValueMetadata.TIMESTAMP_FORMATS, empty);
    metadataFormat.put(RowUtils.RowValueMetadata.TIMESTAMP_FORMATS, formats);

    assertEquals("Invalid null metadata", sqlTimestamp,
        RowUtils.toRowValue("2017-01-01T00:00:00", field, null));
    assertEquals("Invalid null format set in metadata", sqlTimestamp,
        RowUtils.toRowValue("2017-01-01T00:00:00", field, metadataNull));
    assertEquals("Invalid format set", sqlTimestamp,
        RowUtils.toRowValue("2017-01-01 00:00:00.00000", field, metadataFormat));
    assertEquals("Invalid empty format set", sqlTimestamp,
        RowUtils.toRowValue("2017-01-01T00:00:00", field, metadataEmpty));

    thrown.expect(RuntimeException.class);
    thrown.expectMessage(CoreMatchers.containsString("Invalid or unrecognized input format"));
    RowUtils.toRowValue(123, field);
}
Example #29
Source File: TestInListDeriver.java From envelope with Apache License 2.0
private static StructType createTestSchema() {
    return DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("id", DataTypes.StringType, true),
        DataTypes.createStructField("descr", DataTypes.StringType, true),
        DataTypes.createStructField("value", DataTypes.IntegerType, true),
        DataTypes.createStructField("vdate", DataTypes.DateType, true))
    );
}
Example #30
Source File: TestSparkSchema.java From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);

    StructType sparkReadSchema = new StructType(
        new StructField[] {
            new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
        }
    );

    Dataset<Row> resultDf = spark.read()
        .schema(sparkReadSchema)
        .format("iceberg")
        .load(tableLocation);

    Row[] results = (Row[]) resultDf.collect();

    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}