org.apache.spark.util.SerializableConfiguration Java Examples
The following examples show how to use org.apache.spark.util.SerializableConfiguration. This small Spark utility wraps a Hadoop Configuration, which is not java.io.Serializable on its own, so that the configuration can be shipped to executors inside task closures or broadcast variables. Each example notes the project and source file it was taken from.
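Before the real project code, here is a minimal, self-contained sketch of the common wrap/broadcast/unwrap pattern. The class name, app name, and local master setting are illustrative only; the SerializableConfiguration calls themselves match the examples below.

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.SerializableConfiguration;

public class SerializableConfigurationSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("sketch").master("local[2]").getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    // Hadoop's Configuration is not java.io.Serializable, so wrap it before shipping it
    Configuration hadoopConf = jsc.hadoopConfiguration();
    Broadcast<SerializableConfiguration> confBroadcast =
        jsc.broadcast(new SerializableConfiguration(hadoopConf));

    // on executors: the first value() unwraps the Broadcast, the second unwraps the wrapper
    jsc.parallelize(Arrays.asList("a", "b"), 2)
        .map(s -> confBroadcast.value().value().get("fs.defaultFS", "unset") + ":" + s)
        .collect()
        .forEach(System.out::println);

    spark.stop();
  }
}

Everything from here on is real project code; the sketch above only exists to make the pattern explicit before you read it in context.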
Example #1
Source File: RemoveOrphanFilesAction.java (from Iceberg, Apache License 2.0)
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
Example #2
Source File: RemoveOrphanFilesAction.java (from Iceberg, Apache License 2.0)
private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
    Broadcast<SerializableConfiguration> conf,
    long olderThanTimestamp) {

  return (FlatMapFunction<Iterator<String>, String>) dirs -> {
    List<String> subDirs = Lists.newArrayList();
    List<String> files = Lists.newArrayList();

    Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

    int maxDepth = 2000;
    int maxDirectSubDirs = Integer.MAX_VALUE;

    dirs.forEachRemaining(dir -> {
      listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files);
    });

    if (!subDirs.isEmpty()) {
      throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth);
    }

    return files.iterator();
  };
}
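One detail in Example #2 that is easy to misread is the conf.value().value() call: the first value() unwraps the Broadcast into the SerializableConfiguration wrapper, and the second unwraps that into the underlying Hadoop Configuration. A minimal illustrative helper (not from the Iceberg source) that performs the same two-step unwrap:

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.util.SerializableConfiguration;

final class ConfUnwrap {
  // Broadcast<SerializableConfiguration> -> SerializableConfiguration -> Configuration
  static Configuration unwrap(Broadcast<SerializableConfiguration> conf) {
    return conf.value().value();
  }
}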
Example #3
Source File: RemoveOrphanFilesAction.java (from Iceberg, Apache License 2.0)
RemoveOrphanFilesAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf());
  this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism();
  this.table = table;
  this.ops = ((HasTableOperations) table).operations();
  this.location = table.location();
}
Example #4
Source File: SparkUtil.java (from Iceberg, Apache License 2.0)
public static FileIO serializableFileIO(Table table) {
  if (table.io() instanceof HadoopFileIO) {
    // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization
    SerializableConfiguration conf = new SerializableConfiguration(((HadoopFileIO) table.io()).conf());
    return new HadoopFileIO(conf::value);
  } else {
    return table.io();
  }
}
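A hedged usage sketch for the helper above: because the returned FileIO carries its Hadoop configuration inside a SerializableConfiguration, it can be captured by executor-side lambdas without tripping Kryo. The FileIoUsageSketch class and the filePaths RDD are illustrative, not part of Iceberg; the SparkUtil import path is assumed from the example's context.

import org.apache.iceberg.Table;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.spark.api.java.JavaRDD;

final class FileIoUsageSketch {
  // capture the serializable FileIO in a task closure and use it on executors
  static JavaRDD<Long> fileLengths(Table table, JavaRDD<String> filePaths) {
    FileIO io = SparkUtil.serializableFileIO(table);
    return filePaths.map(path -> io.newInputFile(path).getLength());
  }
}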
Example #5
Source File: Writer.java (from Iceberg, Apache License 2.0)
WriterFactory(PartitionSpec spec, FileFormat format, String dataLocation,
              Map<String, String> properties, Configuration conf) {
  this.spec = spec;
  this.format = format;
  this.dataLocation = dataLocation;
  this.properties = properties;
  this.conf = new SerializableConfiguration(conf);
}
Example #6
Source File: Reader.java (from Iceberg, Apache License 2.0)
private ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
                 SerializableConfiguration conf) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.conf = conf;
}
Example #7
Source File: GryoSerializer.java (from TinkerPop, Apache License 2.0)
private SparkIoRegistry() {
  try {
    super.register(GryoIo.class, Tuple2.class, new Tuple2Serializer());
    super.register(GryoIo.class, Tuple2[].class, null);
    super.register(GryoIo.class, Tuple3.class, new Tuple3Serializer());
    super.register(GryoIo.class, Tuple3[].class, null);
    super.register(GryoIo.class, CompactBuffer.class, new CompactBufferSerializer());
    super.register(GryoIo.class, CompactBuffer[].class, null);
    super.register(GryoIo.class, CompressedMapStatus.class, null);
    super.register(GryoIo.class, BlockManagerId.class, null);
    super.register(GryoIo.class, HighlyCompressedMapStatus.class, new ExternalizableSerializer()); // implements Externalizable, so this is safe
    super.register(GryoIo.class, TorrentBroadcast.class, null);
    super.register(GryoIo.class, PythonBroadcast.class, null);
    super.register(GryoIo.class, BoxedUnit.class, null);
    super.register(GryoIo.class, Class.forName("scala.reflect.ClassTag$$anon$1"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("scala.reflect.ManifestFactory$$anon$1"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("org.apache.spark.internal.io.FileCommitProtocol$TaskCommitMessage"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("org.apache.spark.internal.io.FileCommitProtocol$EmptyTaskCommitMessage$"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("scala.collection.immutable.Map$EmptyMap$"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("scala.collection.immutable.Map"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("scala.None$"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("scala.Some$"), new JavaSerializer());
    super.register(GryoIo.class, Class.forName("scala.Some"), new JavaSerializer());
    super.register(GryoIo.class, WrappedArray.ofRef.class, new WrappedArraySerializer());
    super.register(GryoIo.class, MessagePayload.class, null);
    super.register(GryoIo.class, ViewIncomingPayload.class, null);
    super.register(GryoIo.class, ViewOutgoingPayload.class, null);
    super.register(GryoIo.class, ViewPayload.class, null);
    super.register(GryoIo.class, SerializableConfiguration.class, new JavaSerializer());
    super.register(GryoIo.class, VertexWritable.class, new VertexWritableSerializer());
    super.register(GryoIo.class, ObjectWritable.class, new ObjectWritableSerializer());
  } catch (final ClassNotFoundException e) {
    throw new IllegalStateException(e);
  }
}
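A note on the serializer choices above: SerializableConfiguration is registered with Kryo's JavaSerializer because it performs its serialization through Java's writeObject/readObject hooks and keeps the wrapped Hadoop Configuration in a transient field. Kryo's default field-by-field serialization would bypass those hooks, skip the transient field, and deliver an empty wrapper on the other side. HighlyCompressedMapStatus, by contrast, implements Externalizable, so the ExternalizableSerializer suffices.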
Example #8
Source File: Reader.java (from Iceberg, Apache License 2.0)
Reader(Table table, Configuration conf) {
  this.table = table;
  this.conf = new SerializableConfiguration(conf);
  this.schema = table.schema();
}