Java Code Examples for org.apache.spark.sql.DataFrame#unpersist()
The following examples show how to use org.apache.spark.sql.DataFrame#unpersist().
The original project and source file are named above each example.
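Before the project examples below, here is a minimal, self-contained sketch of the typical persist/unpersist lifecycle on the Spark 1.x DataFrame API. It is only an illustration of when unpersist() is usually called; the SQLContext setup, input path, and output path are hypothetical placeholders and are not taken from the examples that follow.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.storage.StorageLevel;

public class UnpersistSketch {
    public static void main(String[] args) {
        // assumption: a local Spark 1.x setup; adjust master and app name as needed
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("unpersist-sketch"));
        SQLContext sql = new SQLContext(sc);

        // hypothetical input path
        DataFrame df = sql.read().json("data/input.json");

        // cache the DataFrame so it is not recomputed for each action below
        df.persist(StorageLevel.MEMORY_AND_DISK());

        long rowCount = df.count();                // first action materializes the cache
        System.out.println("rows: " + rowCount);
        df.write().parquet("data/output.parquet"); // second action reuses the cached data

        // release the cached blocks once the DataFrame is no longer needed
        df.unpersist();

        sc.stop();
    }
}

The examples from rdf2x below follow the same pattern: a DataFrame is created, written out through a persistor, and then unpersisted to free the cached data.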
Example 1
Source File: MetadataWriter.java, from rdf2x, Apache License 2.0
/**
 * Persist predicate metadata table storing all predicates.
 */
public void writePredicateMetadata() {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(PREDICATE_ID, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(PREDICATE_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(PREDICATE_LABEL, DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_ID));

    final IndexMap<String> predicateIndex = rdfSchema.getPredicateIndex();
    final Map<String, String> uriLabels = rdfSchema.getUriLabels();

    // create table rows
    List<Row> rows = predicateIndex.getValues().stream()
            .map(uri -> {
                Object[] valueArray = new Object[]{
                        predicateIndex.getIndex(uri),
                        uri,
                        uriLabels.get(uri)
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Predicates dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(PREDICATES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 2
Source File: MetadataWriter.java, from rdf2x, Apache License 2.0
/**
 * Write metadata describing relation tables
 *
 * @param relationSchema the relation schema
 */
public void writeRelationMetadata(RelationSchema relationSchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RELATIONS_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(RELATIONS_FROM_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_TO_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_PREDICATE_ID, DataTypes.IntegerType, true));

    // create table rows
    List<Row> rows = relationSchema.getTables().stream()
            .map(table -> {
                RelationPredicateFilter predicateFilter = table.getPredicateFilter();
                RelationEntityFilter entityFilter = table.getEntityFilter();
                Object[] valueArray = new Object[]{
                        table.getName(),
                        entityFilter == null ? null : entityFilter.getFromTypeName(),
                        entityFilter == null ? null : entityFilter.getToTypeName(),
                        predicateFilter == null ? null : rdfSchema.getPredicateIndex().getIndex(predicateFilter.getPredicateURI())
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    StructType schema = DataTypes.createStructType(fields);

    // add index for each field
    List<Tuple2<String, String>> indexes = fields.stream()
            .map(field -> new Tuple2<>(RELATIONS_TABLE_NAME, field.name()))
            .collect(Collectors.toList());

    // create and write the META_Relations dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(RELATIONS_TABLE_NAME, df);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 3
Source File: MetadataWriter.java, from rdf2x, Apache License 2.0
/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));

    // primary key on the entity table name
    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();

    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 4
Source File: InstanceRelationWriter.java, from rdf2x, Apache License 2.0
/**
 * Persist the Entity Attribute Value table
 *
 * @param entitySchema entity schema
 * @param instances    RDD of {@link Instance}s
 */
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {
    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));

    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
            .collect(Collectors.toMap(
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()
                            .map(EntityProperty::getPredicate)
                            .collect(Collectors.toSet())
            ));

    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance ->
            instance.getLiteralPredicates().stream()
                    // filter predicates that are in the EAV set of at least one of the instance types
                    .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                            typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                                    typeEavPredicates.get(type).contains(predicate)
                    ))
                    // map to row of values
                    .flatMap(predicate -> {
                        Object value = instance.getLiteralValue(predicate);
                        if (value instanceof Set) {
                            // return a row for each single value
                            return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                        }
                        return Stream.of(getAttributeRow(instance, predicate, value));
                    })
                    .collect(Collectors.toList())
    );

    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));

    // create and write the dataframe
    log.info("Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);
    log.info("Creating indexes for EAV table");
    persistor.createIndexes(indexes);
    df.unpersist();
}