Java Code Examples for org.apache.spark.sql.types.StructType#fields()
The following examples show how to use
org.apache.spark.sql.types.StructType#fields().
Each example comes from an open-source project; the source file, project, and license are noted above each listing.
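As a quick orientation before the project examples: StructType#fields() returns the schema's columns as a StructField[] array, and each StructField exposes name(), dataType(), nullable(), and metadata(). The following minimal sketch (written for this page, not taken from any of the projects below) shows the typical iteration pattern:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class StructTypeFieldsDemo {
    public static void main(String[] args) {
        // Build a simple two-column schema.
        StructType schema = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("id", DataTypes.LongType, false),
            DataTypes.createStructField("name", DataTypes.StringType, true) });

        // fields() returns the schema's columns as a StructField[].
        for (StructField field : schema.fields()) {
            System.out.println(field.name() + ": " + field.dataType().simpleString()
                + " (nullable=" + field.nullable() + ")");
        }
    }
}

Most of the examples below follow this same pattern: fetch the array once, then inspect each field's name and data type.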
Example 1
Source File: MLContextUtil.java From systemds with Apache License 2.0

/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
    StructType schema = df.schema();
    StructField[] fields = schema.fields();
    if (fields == null) {
        return true;
    }
    for (StructField field : fields) {
        DataType dataType = field.dataType();
        if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
                && (dataType != DataTypes.LongType)
                && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
                && (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
            // uncomment if we support arrays of doubles for matrices
            // if (dataType instanceof ArrayType) {
            //     ArrayType arrayType = (ArrayType) dataType;
            //     if (arrayType.elementType() == DataTypes.DoubleType) {
            //         continue;
            //     }
            // }
            return false;
        }
    }
    return true;
}
Example 2
Source File: UnaryTransformer.java From ambiverse-nlu with Apache License 2.0

@Override
public StructType transformSchema(StructType structType) {
    String inputCol = getInputCol();
    String outputCol = getOutputCol();
    DataType inputType = structType.apply(inputCol).dataType();
    this.validateInputType(inputType);
    List<String> names = Arrays.asList(structType.fieldNames());
    Cond.require(!names.contains(outputCol),
        "The output column " + outputCol + " already exists in this schema!");
    List<StructField> fields = new ArrayList<>();
    for (int i = 0; i < structType.fields().length; i++) {
        fields.add(structType.fields()[i]);
    }
    DataType dt = getOutputDataType();
    fields.add(DataTypes.createStructField(outputCol, dt, isOutputDataTypeNullable()));
    return DataTypes.createStructType(fields);
}
Example 3
Source File: DBClientWrapper.java From spark-data-sources with MIT License

public static edb.common.Row sparkToDBRow(org.apache.spark.sql.Row row, StructType type) {
    edb.common.Row dbRow = new edb.common.Row();
    StructField[] fields = type.fields();
    for (int i = 0; i < type.size(); i++) {
        StructField sf = fields[i];
        if (sf.dataType() == DataTypes.StringType) {
            dbRow.addField(new edb.common.Row.StringField(sf.name(), row.getString(i)));
        } else if (sf.dataType() == DataTypes.DoubleType) {
            dbRow.addField(new edb.common.Row.DoubleField(sf.name(), row.getDouble(i)));
        } else if (sf.dataType() == DataTypes.LongType) {
            dbRow.addField(new edb.common.Row.Int64Field(sf.name(), row.getLong(i)));
        } else {
            // TODO: type leakage
        }
    }
    return dbRow;
}
Example 4
Source File: Reader.java From iceberg with Apache License 2.0

PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
    StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
    StructField[] fields = partitionType.fields();

    this.types = new DataType[fields.length];
    this.positions = new int[types.length];
    this.javaTypes = new Class<?>[types.length];
    this.reusedRow = new GenericInternalRow(types.length);

    List<PartitionField> partitionFields = spec.fields();
    for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
        this.types[rowIndex] = fields[rowIndex].dataType();

        int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
        for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
            PartitionField field = spec.fields().get(specIndex);
            if (field.sourceId() == sourceId && "identity".equals(field.transform().toString())) {
                positions[rowIndex] = specIndex;
                javaTypes[rowIndex] = spec.javaClasses()[specIndex];
                break;
            }
        }
    }
}
Example 5
Source File: SchemaConverter.java From geowave with Apache License 2.0

public static SimpleFeatureType schemaToFeatureType(final StructType schema, final String typeName) {
    final SimpleFeatureTypeBuilder typeBuilder = new SimpleFeatureTypeBuilder();
    typeBuilder.setName(typeName);
    typeBuilder.setNamespaceURI(BasicFeatureTypes.DEFAULT_NAMESPACE);
    try {
        typeBuilder.setCRS(CRS.decode("EPSG:4326", true));
    } catch (final FactoryException e) {
        LOGGER.error(e.getMessage(), e);
    }

    final AttributeTypeBuilder attrBuilder = new AttributeTypeBuilder();

    for (final StructField field : schema.fields()) {
        final AttributeDescriptor attrDesc = attrDescFromStructField(attrBuilder, field);
        typeBuilder.add(attrDesc);
    }

    return typeBuilder.buildFeatureType();
}
Example 6
Source File: ExternalTableUtils.java From spliceengine with GNU Affero General Public License v3.0

public static void setPartitionColumnTypes(StructType dataSchema, int[] baseColumnMap, StructType tableSchema) {
    int ncolumns = dataSchema.fields().length;
    int nPartitions = baseColumnMap.length;
    for (int i = 0; i < baseColumnMap.length; ++i) {
        String name = dataSchema.fields()[ncolumns - i - 1].name();
        org.apache.spark.sql.types.DataType type =
            tableSchema.fields()[baseColumnMap[nPartitions - i - 1]].dataType();
        boolean nullable = tableSchema.fields()[baseColumnMap[nPartitions - i - 1]].nullable();
        Metadata metadata = tableSchema.fields()[baseColumnMap[nPartitions - i - 1]].metadata();
        StructField field = new StructField(name, type, nullable, metadata);
        dataSchema.fields()[ncolumns - i - 1] = field;
    }
}
Example 7
Source File: ColumnUtils.java From net.jgp.labs.spark with Apache License 2.0

public static Metadata getMetadata(Dataset<Row> df, String colName) {
    StructType schema = df.schema();
    StructField[] fields = schema.fields();
    for (StructField field : fields) {
        // TODO check on case
        if (field.name().compareTo(colName) == 0) {
            return field.metadata();
        }
    }
    return null;
}
Example 8
Source File: UnsafeFixedWidthAggregationMap.java From indexr with Apache License 2.0

/**
 * @return true if UnsafeFixedWidthAggregationMap supports aggregation buffers with the given
 *         schema, false otherwise.
 */
public static boolean supportsAggregationBufferSchema(StructType schema) {
    for (StructField field : schema.fields()) {
        if (!UnsafeRow.isMutable(field.dataType())) {
            return false;
        }
    }
    return true;
}
Example 9
Source File: DataFrames.java From DataVec with Apache License 2.0

/**
 * Create a datavec schema from a struct type
 *
 * @param structType the struct type to create the schema from
 * @return the created schema
 */
public static Schema fromStructType(StructType structType) {
    Schema.Builder builder = new Schema.Builder();
    StructField[] fields = structType.fields();
    String[] fieldNames = structType.fieldNames();
    for (int i = 0; i < fields.length; i++) {
        String name = fields[i].dataType().typeName().toLowerCase();
        switch (name) {
            case "double":
                builder.addColumnDouble(fieldNames[i]);
                break;
            case "float":
                builder.addColumnFloat(fieldNames[i]);
                break;
            case "long":
                builder.addColumnLong(fieldNames[i]);
                break;
            case "int":
            case "integer":
                builder.addColumnInteger(fieldNames[i]);
                break;
            case "string":
                builder.addColumnString(fieldNames[i]);
                break;
            default:
                throw new RuntimeException("Unknown type: " + name);
        }
    }
    return builder.build();
}
Example 10
Source File: Reader.java From iceberg with Apache License 2.0

StructLikeInternalRow(StructType struct) {
    this.types = new DataType[struct.size()];

    StructField[] fields = struct.fields();
    for (int i = 0; i < fields.length; i += 1) {
        types[i] = fields[i].dataType();
    }
}
Example 11
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testOutputDataFrameOfVectorsDML() {
    System.out.println("MLContextTest - output DataFrame of vectors DML");

    String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
    Script script = dml(s).out("m");
    MLResults results = ml.execute(script);
    Dataset<Row> df = results.getDataFrame("m", true);
    Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);

    // verify column types
    StructType schema = sortedDF.schema();
    StructField[] fields = schema.fields();
    StructField idColumn = fields[0];
    StructField vectorColumn = fields[1];
    Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
    Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);

    List<Row> list = sortedDF.collectAsList();

    Row row1 = list.get(0);
    Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
    Vector v1 = (DenseVector) row1.get(1);
    double[] arr1 = v1.toArray();
    Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);

    Row row2 = list.get(1);
    Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
    Vector v2 = (DenseVector) row2.get(1);
    double[] arr2 = v2.toArray();
    Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
Example 12
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0

/**
 * Obtain column vector from DataFrame schema
 *
 * @param dfschema schema as StructType
 * @param containsID if true, contains ID column
 * @return 0-based column index of vector column, -1 if no vector.
 */
private static int getColVectFromDFSchema(StructType dfschema, boolean containsID) {
    int off = containsID ? 1 : 0;
    for (int i = off; i < dfschema.fields().length; i++) {
        StructField structType = dfschema.apply(i);
        if (structType.dataType() instanceof VectorUDT)
            return i - off;
    }
    return -1;
}
Example 13
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0

/**
 * If the MatrixFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper MatrixFormat.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata, if available
 */
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    if (matrixMetadata == null) {
        return;
    }
    MatrixFormat matrixFormat = matrixMetadata.getMatrixFormat();
    if (matrixFormat != null) {
        return;
    }

    StructType schema = dataFrame.schema();
    boolean hasID = false;
    try {
        schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
        hasID = true;
    } catch (IllegalArgumentException iae) {
    }

    StructField[] fields = schema.fields();
    MatrixFormat mf = null;
    if (hasID) {
        if (fields[1].dataType() instanceof VectorUDT) {
            mf = MatrixFormat.DF_VECTOR_WITH_INDEX;
        } else {
            mf = MatrixFormat.DF_DOUBLES_WITH_INDEX;
        }
    } else {
        if (fields[0].dataType() instanceof VectorUDT) {
            mf = MatrixFormat.DF_VECTOR;
        } else {
            mf = MatrixFormat.DF_DOUBLES;
        }
    }

    if (mf == null) {
        throw new MLContextException("DataFrame format not recognized as an accepted SystemDS MatrixFormat");
    }
    matrixMetadata.setMatrixFormat(mf);
}
Example 14
Source File: DBClientWrapper.java From spark-data-sources with MIT License

public static Schema sparkToDbSchema(StructType st) {
    Schema schema = new Schema();
    for (StructField sf : st.fields()) {
        if (sf.dataType() == DataTypes.StringType) {
            schema.addColumn(sf.name(), Schema.ColumnType.STRING);
        } else if (sf.dataType() == DataTypes.DoubleType) {
            schema.addColumn(sf.name(), Schema.ColumnType.DOUBLE);
        } else if (sf.dataType() == DataTypes.LongType) {
            schema.addColumn(sf.name(), Schema.ColumnType.INT64);
        } else {
            // TODO: type leakage
        }
    }
    return schema;
}
Example 15
Source File: ExternalTableUtils.java From spliceengine with GNU Affero General Public License v3.0

public static void checkSchema(StructType tableSchema,
                               StructType dataSchema,
                               int[] partitionColumnMap,
                               String location) throws StandardException {
    StructField[] tableFields = tableSchema.fields();
    StructField[] dataFields = dataSchema.fields();
    if (tableFields.length != dataFields.length) {
        throw StandardException.newException(SQLState.INCONSISTENT_NUMBER_OF_ATTRIBUTE,
                tableFields.length, dataFields.length, location);
    }

    StructField[] partitionedTableFields = new StructField[tableSchema.fields().length];
    Set<Integer> partitionColumns = new HashSet<>();
    for (int pos : partitionColumnMap) {
        partitionColumns.add(pos);
    }
    int index = 0;
    for (int i = 0; i < tableFields.length; ++i) {
        if (!partitionColumns.contains(i)) {
            partitionedTableFields[index++] = tableFields[i];
        }
    }

    for (int i = 0; i < tableFields.length - partitionColumnMap.length; ++i) {
        String tableFieldTypeName = partitionedTableFields[i].dataType().typeName();
        String dataFieldTypeName = dataFields[i].dataType().typeName();
        if (!tableFieldTypeName.equals(dataFieldTypeName)) {
            throw StandardException.newException(SQLState.INCONSISTENT_DATATYPE_ATTRIBUTES,
                    tableFields[i].name(), tableFields[i].dataType().toString(),
                    dataFields[i].name(), dataFields[i].dataType().toString(), location);
        }
    }
}
Example 16
Source File: SqlResultsWriter.java From geowave with Apache License 2.0

public void writeResults(String typeName) {
    if (typeName == null) {
        typeName = DEFAULT_TYPE_NAME;
        LOGGER.warn(
            "Using default type name (adapter id): '" + DEFAULT_TYPE_NAME + "' for SQL output");
    }

    final StructType schema = results.schema();
    final SimpleFeatureType featureType = SchemaConverter.schemaToFeatureType(schema, typeName);
    final SimpleFeatureBuilder sfBuilder = new SimpleFeatureBuilder(featureType);
    final FeatureDataAdapter featureAdapter = new FeatureDataAdapter(featureType);

    final DataStore featureStore = outputDataStore.createDataStore();
    final Index featureIndex =
        new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());

    featureStore.addType(featureAdapter, featureIndex);

    try (Writer writer = featureStore.createWriter(featureAdapter.getTypeName())) {
        final List<Row> rows = results.collectAsList();
        for (int r = 0; r < rows.size(); r++) {
            final Row row = rows.get(r);
            for (int i = 0; i < schema.fields().length; i++) {
                final StructField field = schema.apply(i);
                final Object rowObj = row.apply(i);
                if (rowObj != null) {
                    if (field.name().equals("geom")) {
                        final Geometry geom = (Geometry) rowObj;
                        sfBuilder.set("geom", geom);
                    } else if (field.dataType() == DataTypes.TimestampType) {
                        final long millis = ((Timestamp) rowObj).getTime();
                        final Date date = new Date(millis);
                        sfBuilder.set(field.name(), date);
                    } else {
                        sfBuilder.set(field.name(), rowObj);
                    }
                }
            }
            final SimpleFeature sf = sfBuilder.buildFeature("result-" + nf.format(r));
            writer.write(sf);
        }
    }
}
Example 17
Source File: SparkTypeToType.java From iceberg with Apache License 2.0

SparkTypeToType(StructType root) {
    this.root = root;
    // the root struct's fields use the first ids
    this.nextId = root.fields().length;
}
Example 18
Source File: SchemaIntrospectionApp.java From net.jgp.labs.spark with Apache License 2.0

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Array to Dataframe (Dataset<Row>)")
        .master("local")
        .getOrCreate();

    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("id", DataTypes.IntegerType, false),
        DataTypes.createStructField("value-s", DataTypes.StringType, false),
        DataTypes.createStructField("value-d", DataTypes.DoubleType, false),
        DataTypes.createStructField(
            "array",
            DataTypes.createArrayType(DataTypes.StringType, false),
            false),
        DataTypes.createStructField(
            "struct",
            DataTypes.createStructType(new StructField[] {
                DataTypes.createStructField("sid", DataTypes.IntegerType, false),
                DataTypes.createStructField("svalue", DataTypes.StringType, false) }),
            false),
        DataTypes.createStructField(
            "array-struct",
            DataTypes.createArrayType(
                DataTypes.createStructType(new StructField[] {
                    DataTypes.createStructField("asid", DataTypes.IntegerType, false),
                    DataTypes.createStructField("asvalue", DataTypes.StringType, false) })),
            false) });

    List<Row> rows = new ArrayList<>();
    for (int x = 0; x < 10; x++) {
        List<Row> subrows = new ArrayList<>();
        for (int y = 1000; y < 1003; y++) {
            subrows.add(RowFactory.create(y, "Sub " + y));
        }
        Row str = RowFactory.create(x * 5000, "Struct #" + x);
        String[] array = new String[] { "v" + (x * 100), "v" + (x * 100 + 1) };
        rows.add(
            RowFactory.create(x, "Value " + x, x / 4.0, array, str, subrows));
    }

    Dataset<Row> df = spark.createDataFrame(rows, schema);
    df.show(false);
    df.printSchema();

    StructType readSchema = df.schema();

    String[] fieldNames = readSchema.fieldNames();
    int i = 0;
    for (String fieldName : fieldNames) {
        log.info("Field #{}: '{}'", i++, fieldName);
    }
    log.info("Catalog: '{}'", readSchema.catalogString());

    StructField[] fields = readSchema.fields();
    i = 0;
    for (StructField field : fields) {
        log.info("DDL for field #{}: '{}'", i++, field.toDDL());
    }
}
Example 19
Source File: SparkRowConverterTest.java From bunsen with Apache License 2.0

/**
 * Recursively walks the schema to ensure there are no struct fields that are empty.
 */
private void checkNoEmptyStructs(StructType schema, String fieldName) {
    Assert.assertNotEquals("Struct field " + fieldName + " is empty",
        0,
        schema.fields().length);

    for (StructField field : schema.fields()) {
        if (field.dataType() instanceof StructType) {
            checkNoEmptyStructs((StructType) field.dataType(), field.name());
        } else if (field.dataType() instanceof ArrayType) {
            ArrayType arrayType = (ArrayType) field.dataType();
            if (arrayType.elementType() instanceof StructType) {
                if (!field.name().equals("contained")) {
                    checkNoEmptyStructs((StructType) arrayType.elementType(), field.name());
                }
            }
        }
    }
}