org.apache.spark.Partition Java Examples
The following examples show how to use org.apache.spark.Partition.
Each example comes from an open-source project; the source file and license are noted above each snippet.
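Before the individual examples, a quick orientation: org.apache.spark.Partition is the scheduler's handle for one slice of an RDD. A custom RDD returns an array of partitions from getPartitions() and later receives one of them back in compute() on an executor. The sketch below is a minimal, hypothetical Java RDD (the class name ConstantRDD and everything inside it are illustrative, not taken from the projects referenced here; it assumes Spark 2.x, where scala.collection.JavaConversions is still available) showing the two halves of that contract together.

import java.util.Arrays;
import java.util.stream.IntStream;

import org.apache.spark.Dependency;
import org.apache.spark.Partition;
import org.apache.spark.SparkContext;
import org.apache.spark.TaskContext;
import org.apache.spark.rdd.RDD;

import scala.collection.JavaConversions;
import scala.collection.mutable.ArrayBuffer;
import scala.reflect.ClassTag$;

// Hypothetical RDD with a fixed number of partitions, each yielding one constant string.
public class ConstantRDD extends RDD<String> {
  private final int numPartitions;

  public ConstantRDD(final SparkContext sc, final int numPartitions) {
    // No parent RDDs, so the dependency list is empty; the ClassTag carries the element type.
    super(sc, new ArrayBuffer<Dependency<?>>(), ClassTag$.MODULE$.apply(String.class));
    this.numPartitions = numPartitions;
  }

  @Override
  public Partition[] getPartitions() {
    // Seen from Java, Partition's only abstract method is index(), so a lambda per index suffices.
    return IntStream.range(0, numPartitions)
        .mapToObj(i -> (Partition) () -> i)
        .toArray(Partition[]::new);
  }

  @Override
  public scala.collection.Iterator<String> compute(final Partition split, final TaskContext context) {
    // Spark hands one of the Partition objects from getPartitions() back here on an executor.
    return JavaConversions.asScalaIterator(
        Arrays.asList("value-for-partition-" + split.index()).iterator());
  }
}

Most of the examples below implement these same two hooks, but back each Partition with a meaningful split: a Hadoop InputSplit, a JDBC key range, a Cassandra token range, a set of MongoDB replicas, and so on.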
Example #1
Source File: SparkDatasetBoundedSourceVertex.java From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param sparkSession sparkSession to recreate on each executor.
 * @param dataset      Dataset to read data from.
 */
public SparkDatasetBoundedSourceVertex(final SparkSession sparkSession, final Dataset<T> dataset) {
  this.readables = new ArrayList<>();
  final RDD rdd = dataset.sparkRDD();
  final Partition[] partitions = rdd.getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkDatasetBoundedSourceReadable(
      partitions[i],
      sparkSession.getDatasetCommandsList(),
      sparkSession.getInitialConf(),
      i));
  }
  this.estimatedByteSize = dataset.javaRDD()
    .map(o -> (long) o.toString().getBytes("UTF-8").length)
    .reduce((a, b) -> a + b);
}
Example #2
Source File: GenericHadoopExtractor.java From deep-spark with Apache License 2.0
@Override
public void initIterator(Partition dp, S config) {
  int id = config.getRddId();

  NewHadoopPartition split = (NewHadoopPartition) dp;

  TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
      .newTaskAttemptID(jobTrackerId, id, true, split.index(), 0);

  Configuration configuration = getHadoopConfig(config);

  TaskAttemptContext hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
      .newTaskAttemptContext(configuration, attemptId);

  try {
    reader = inputFormat.createRecordReader(split.serializableHadoopSplit().value(), hadoopAttemptContext);
    reader.initialize(split.serializableHadoopSplit().value(), hadoopAttemptContext);
  } catch (IOException | InterruptedException e) {
    throw new DeepGenericException(e);
  }
}
Example #3
Source File: ExtractorClientHandler.java From deep-spark with Apache License 2.0
@Override
public Partition[] getPartitions(ExtractorConfig<T> config) {
  GetPartitionsAction<T> getPartitionsAction = new GetPartitionsAction<>(config);

  channel.writeAndFlush(getPartitionsAction);

  Response response;
  boolean interrupted = false;
  for (;;) {
    try {
      response = answer.take();
      break;
    } catch (InterruptedException ignore) {
      interrupted = true;
    }
  }

  if (interrupted) {
    Thread.currentThread().interrupt();
  }

  return ((GetPartitionsResponse) response).getPartitions();
}
Example #4
Source File: ExtractorClientHandler.java From deep-spark with Apache License 2.0
@Override
public void initIterator(Partition dp, ExtractorConfig<T> config) {
  InitIteratorAction<T> initIteratorAction = new InitIteratorAction<>(dp, config);

  channel.writeAndFlush(initIteratorAction);

  Response response;
  boolean interrupted = false;
  for (;;) {
    try {
      response = answer.take();
      break;
    } catch (InterruptedException ignore) {
      interrupted = true;
    }
  }

  if (interrupted) {
    Thread.currentThread().interrupt();
  }
}
Example #5
Source File: JdbcNativeExtractor.java From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public Partition[] getPartitions(S config) {
  jdbcDeepJobConfig = initConfig(config, jdbcDeepJobConfig);

  int upperBound = jdbcDeepJobConfig.getUpperBound();
  int lowerBound = jdbcDeepJobConfig.getLowerBound();
  int numPartitions = jdbcDeepJobConfig.getNumPartitions();
  int length = 1 + upperBound - lowerBound;
  Partition[] result = new Partition[numPartitions];

  // Split [lowerBound, upperBound] into numPartitions contiguous, non-overlapping ranges.
  for (int i = 0; i < numPartitions; i++) {
    int start = lowerBound + ((i * length) / numPartitions);
    int end = lowerBound + (((i + 1) * length) / numPartitions) - 1;
    result[i] = new JdbcPartition(i, start, end);
  }
  return result;
}
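As a quick sanity check on the splitting arithmetic (with illustrative bounds, not values from the project's tests): for lowerBound = 0, upperBound = 9 and numPartitions = 3, length is 10 and the loop yields the ranges [0, 2], [3, 5] and [6, 9], covering the key space with no gaps or overlaps. A standalone sketch of that calculation (the class name is hypothetical):

// Reproduces the range math from getPartitions(S config) with made-up bounds.
public class JdbcRangeSplitCheck {
  public static void main(String[] args) {
    int lowerBound = 0, upperBound = 9, numPartitions = 3;
    int length = 1 + upperBound - lowerBound;                          // 10
    for (int i = 0; i < numPartitions; i++) {
      int start = lowerBound + ((i * length) / numPartitions);         // 0, 3, 6
      int end = lowerBound + (((i + 1) * length) / numPartitions) - 1; // 2, 5, 9
      System.out.printf("partition %d -> [%d, %d]%n", i, start, end);
    }
  }
}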
Example #6
Source File: JdbcReader.java From deep-spark with Apache License 2.0
/**
 * Initializes the reader.
 *
 * @param p Spark partition.
 * @throws Exception
 */
public void init(Partition p) throws Exception {
  Class.forName(jdbcDeepJobConfig.getDriverClass());
  conn = DriverManager.getConnection(jdbcDeepJobConfig.getConnectionUrl(),
      jdbcDeepJobConfig.getUsername(),
      jdbcDeepJobConfig.getPassword());

  Statement statement = conn.createStatement();
  SelectQuery query = jdbcDeepJobConfig.getQuery();
  JdbcPartition jdbcPartition = (JdbcPartition) p;
  if (jdbcDeepJobConfig.getNumPartitions() > 1) {
    Column partitionKey = jdbcDeepJobConfig.getPartitionKey();
    query.getWhereClause()
        .addCondition(BinaryCondition.lessThan(partitionKey, jdbcPartition.upper(), true))
        .addCondition(BinaryCondition.greaterThan(partitionKey, jdbcPartition.lower(), true));
  }
  resultSet = statement.executeQuery(query.toString());

  // Fetches first element
  this.hasNext = resultSet.next();
}
Example #7
Source File: RangePartitionCoalescer.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public PartitionGroup[] coalesce(int maxPartitions, RDD<?> parent) {
  if (maxPartitions != parent.getNumPartitions()) {
    throw new IllegalArgumentException("Cannot use " + getClass().getSimpleName() +
        " with a different number of partitions to the parent RDD.");
  }
  List<Partition> partitions = Arrays.asList(parent.getPartitions());
  PartitionGroup[] groups = new PartitionGroup[partitions.size()];

  for (int i = 0; i < partitions.size(); i++) {
    Seq<String> preferredLocations = parent.getPreferredLocations(partitions.get(i));
    scala.Option<String> preferredLocation = scala.Option.apply(
        preferredLocations.isEmpty() ? null : preferredLocations.apply(0));
    PartitionGroup group = new PartitionGroup(preferredLocation);
    List<Partition> partitionsInGroup = partitions.subList(i, maxEndPartitionIndexes.get(i) + 1);
    group.partitions().append(JavaConversions.asScalaBuffer(partitionsInGroup));
    groups[i] = group;
  }
  return groups;
}
Example #8
Source File: SparkTextFileBoundedSourceVertex.java From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param sparkContext  the spark context.
 * @param inputPath     the path of the target text file.
 * @param numPartitions the number of partitions.
 */
public SparkTextFileBoundedSourceVertex(final SparkContext sparkContext,
                                        final String inputPath,
                                        final int numPartitions) {
  this.readables = new ArrayList<>();
  final Partition[] partitions = sparkContext.textFile(inputPath, numPartitions).getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkTextFileBoundedSourceReadable(
      partitions[i],
      sparkContext.getConf(),
      i,
      inputPath,
      numPartitions));
  }
  this.estimatedSizeBytes = SizeEstimator.estimate(sparkContext.textFile(inputPath, numPartitions));
}
Example #9
Source File: CassandraExtractor.java From deep-spark with Apache License 2.0
@Override
public void initIterator(final Partition dp, S config) {
  cassandraJobConfig = initConfig(config, cassandraJobConfig);
  recordReader = initRecordReader((DeepPartition) dp, cassandraJobConfig);
}
Example #10
Source File: JdbcNativeExtractorTest.java From deep-spark with Apache License 2.0
@Test
public void testPartitions() {
  JdbcNativeExtractor extractor = createJdbcNativeExtractor();

  Partition[] partitions = extractor.getPartitions(createJdbcDeepJobConfig());

  assertEquals(partitions.length, NUM_PARTITIONS);

  JdbcPartition partition0 = (JdbcPartition) partitions[0];
  assertEquals(partition0.index(), 0);
  assertEquals(partition0.lower(), 0);
  assertEquals(partition0.upper(), Integer.MAX_VALUE - 1);
}
Example #11
Source File: JdbcNeo4JReader.java From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void init(Partition p) throws Exception {
  Class.forName(Driver.class.getCanonicalName());
  conn = DriverManager.getConnection(jdbcNeo4JDeepJobConfig.getConnectionUrl(),
      jdbcNeo4JDeepJobConfig.getUsername(),
      jdbcNeo4JDeepJobConfig.getPassword());

  Statement statement = conn.createStatement();
  String query = jdbcNeo4JDeepJobConfig.getCypherQuery();
  resultSet = statement.executeQuery(query);

  // Fetches first element
  this.hasNext = resultSet.next();
}
Example #12
Source File: CassandraExtractor.java From deep-spark with Apache License 2.0
/**
 * Returns the partitions this RDD depends on.
 * <p/>
 * Uses the underlying CqlPagingInputFormat in order to retrieve the splits.
 * <p/>
 * The number of splits, and hence the number of partitions, equals the number of tokens configured in
 * cassandra.yaml + 1.
 */
@Override
public Partition[] getPartitions(S config) {
  cassandraJobConfig = initConfig(config, cassandraJobConfig);

  List<DeepTokenRange> underlyingInputSplits = null;
  if (isFilterdByKey(cassandraJobConfig.getFilters(),
      cassandraJobConfig.fetchTableMetadata().getPartitionKey().get(0).getName())) {
    underlyingInputSplits = new ArrayList<>();
    underlyingInputSplits.add(new DeepTokenRange(Long.MIN_VALUE, Long.MAX_VALUE, cassandraJobConfig.getHostList()));
  } else {
    if (cassandraJobConfig.isBisectModeSet()) {
      underlyingInputSplits = RangeUtils.getSplits(cassandraJobConfig);
    } else {
      underlyingInputSplits = ThriftRangeUtils.build(cassandraJobConfig).getSplits();
    }
  }

  Partition[] partitions = new DeepPartition[underlyingInputSplits.size()];

  int i = 0;
  for (DeepTokenRange split : underlyingInputSplits) {
    partitions[i] = new DeepPartition(cassandraJobConfig.getRddId(), i, split);
    // log().debug("Detected partition: " + partitions[i]);
    ++i;
  }
  return partitions;
}
Example #13
Source File: JdbcNativeExtractor.java From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void initIterator(Partition dp, S config) {
  jdbcDeepJobConfig = initConfig(config, jdbcDeepJobConfig);
  this.jdbcReader = new JdbcReader(jdbcDeepJobConfig);
  try {
    this.jdbcReader.init(dp);
  } catch (Exception e) {
    throw new DeepGenericException("Unable to initialize JdbcReader", e);
  }
}
Example #14
Source File: JdbcNeo4JNativeExtractor.java From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void initIterator(Partition dp, S config) {
  jdbcNeo4JDeepJobConfig = initConfig(config, jdbcNeo4JDeepJobConfig);
  this.jdbcReader = new JdbcNeo4JReader(jdbcNeo4JDeepJobConfig);
  try {
    this.jdbcReader.init(dp);
  } catch (Exception e) {
    throw new DeepGenericException("Unable to initialize JdbcReader", e);
  }
}
Example #15
Source File: JdbcNeo4JNativeExtractor.java From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public Partition[] getPartitions(S config) {
  JdbcNeo4JDeepJobConfig neo4jConfig = (JdbcNeo4JDeepJobConfig) config;
  JdbcPartition partition = new JdbcPartition(0, neo4jConfig.getLowerBound(), neo4jConfig.getUpperBound());
  Partition[] result = new Partition[1];
  result[0] = partition;
  return result;
}
Example #16
Source File: MongoReader.java From deep-spark with Apache License 2.0
/**
 * Initializes the reader for the given partition.
 *
 * @param partition the partition
 */
public void init(Partition partition) {
  try {
    List<ServerAddress> addressList = new ArrayList<>();
    for (String s : (List<String>) ((DeepPartition) partition).splitWrapper().getReplicas()) {
      addressList.add(new ServerAddress(s));
    }

    // Credentials
    List<MongoCredential> mongoCredentials = new ArrayList<>();
    if (mongoDeepJobConfig.getUsername() != null && mongoDeepJobConfig.getPassword() != null) {
      MongoCredential credential = MongoCredential.createMongoCRCredential(mongoDeepJobConfig.getUsername(),
          mongoDeepJobConfig.getDatabase(),
          mongoDeepJobConfig.getPassword().toCharArray());
      mongoCredentials.add(credential);
    }

    mongoClient = new MongoClient(addressList, mongoCredentials);
    mongoClient.setReadPreference(ReadPreference.valueOf(mongoDeepJobConfig.getReadPreference()));
    db = mongoClient.getDB(mongoDeepJobConfig.getDatabase());
    collection = db.getCollection(mongoDeepJobConfig.getCollection());
    dbCursor = collection.find(generateFilterQuery((MongoPartition) partition),
        mongoDeepJobConfig.getDBFields());
  } catch (UnknownHostException e) {
    throw new DeepExtractorInitializationException(e);
  }
}
Example #17
Source File: MongoNativeExtractor.java From deep-spark with Apache License 2.0
@Override
public void initIterator(Partition dp, S config) {
  mongoDeepJobConfig = initConfig(config, mongoDeepJobConfig);
  reader = new MongoReader(mongoDeepJobConfig);
  reader.init(dp);
}
Example #18
Source File: DeepRDD.java From deep-spark with Apache License 2.0
@Override
public Iterator<T> compute(Partition split, TaskContext context) {

  initExtractorClient();

  extractorClient.initIterator(split, config.getValue());

  context.addTaskCompletionListener(new AbstractFunction1<TaskContext, BoxedUnit>() {
    @Override
    public BoxedUnit apply(TaskContext v1) {
      extractorClient.close();
      return null;
    }
  });

  java.util.Iterator<T> iterator = new java.util.Iterator<T>() {

    @Override
    public boolean hasNext() {
      return extractorClient.hasNext();
    }

    @Override
    public T next() {
      return extractorClient.next();
    }

    @Override
    public void remove() {
      throw new DeepIOException(
          "Method not implemented (and won't be implemented anytime soon!!!)");
    }
  };

  return new InterruptibleIterator<>(context, asScalaIterator(iterator));
}
Example #19
Source File: DeepRDD.java From deep-spark with Apache License 2.0
@Override
public Seq<String> getPreferredLocations(Partition split) {
  initExtractorClient();

  List<String> locations = extractorClient.getPreferredLocations(split);
  if (locations == null || locations.isEmpty()) {
    return super.getPreferredLocations(split);
  }
  return asScalaBuffer(locations);
}
Example #20
Source File: HBasePartitioner.java From spliceengine with GNU Affero General Public License v3.0
@Override
public void initialize() {
  List<Partition> partitions = Arrays.asList(((SparkDataSet) dataSet).rdd.rdd().partitions());
  tableSplits = new ArrayList<>(partitions.size());
  for (Partition p : partitions) {
    NewHadoopPartition nhp = (NewHadoopPartition) p;
    SMSplit sms = (SMSplit) nhp.serializableHadoopSplit().value();
    TableSplit ts = sms.getSplit();
    if (ts.getStartRow() != null && Bytes.equals(ts.getStartRow(), ts.getEndRow()) && ts.getStartRow().length > 0) {
      // this would be an empty partition, with the same start and end key, so don't add it
      continue;
    }
    tableSplits.add(ts);
  }
}
Example #21
Source File: SparkDatasetBoundedSourceVertex.java From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param partition          the partition to wrap.
 * @param commands           list of commands needed to build the dataset.
 * @param sessionInitialConf spark session's initial configuration.
 * @param partitionIndex     partition for this readable.
 */
private SparkDatasetBoundedSourceReadable(final Partition partition,
                                          final LinkedHashMap<String, Object[]> commands,
                                          final Map<String, String> sessionInitialConf,
                                          final int partitionIndex) {
  this.commands = commands;
  this.sessionInitialConf = sessionInitialConf;
  this.partitionIndex = partitionIndex;
  this.locations = SparkSourceUtil.getPartitionLocation(partition);
}
Example #22
Source File: SparkTextFileBoundedSourceVertex.java From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param partition      the partition to wrap.
 * @param sparkConf      configuration needed to build the SparkContext.
 * @param partitionIndex partition for this readable.
 * @param inputPath      the input file path.
 * @param numPartitions  the total number of partitions.
 */
private SparkTextFileBoundedSourceReadable(final Partition partition,
                                           final SparkConf sparkConf,
                                           final int partitionIndex,
                                           final String inputPath,
                                           final int numPartitions) {
  this.sparkConf = sparkConf;
  this.partitionIndex = partitionIndex;
  this.inputPath = inputPath;
  this.numPartitions = numPartitions;
  this.locations = SparkSourceUtil.getPartitionLocation(partition);
}
Example #23
Source File: ExtractorServerHandler.java From deep-spark with Apache License 2.0
protected Partition[] getPartitions(GetPartitionsAction<T> getPartitionsAction) {
  if (extractor == null) {
    this.initExtractor(getPartitionsAction.getConfig());
  }
  return extractor.getPartitions(getPartitionsAction.getConfig());
}
Example #24
Source File: SparkSourceUtil.java From incubator-nemo with Apache License 2.0
/**
 * Gets the source location of a Spark partition.
 *
 * @param partition the partition to get location.
 * @return a list of locations.
 * @throws RuntimeException if failed to get source location.
 */
static List<String> getPartitionLocation(final Partition partition) {
  try {
    if (partition instanceof HadoopPartition) {
      final Field inputSplitField = partition.getClass().getDeclaredField("inputSplit");
      inputSplitField.setAccessible(true);
      final InputSplit inputSplit = (InputSplit) ((SerializableWritable) inputSplitField.get(partition)).value();

      final String[] splitLocations = inputSplit.getLocations();
      final List<String> parsedLocations = new ArrayList<>();

      for (final String loc : splitLocations) {
        final String canonicalHostName = InetAddress.getByName(loc).getCanonicalHostName();
        parsedLocations.add(canonicalHostName);
      }

      if (parsedLocations.size() == 1 && parsedLocations.get(0).equals("localhost")) {
        return Collections.emptyList();
      } else {
        return parsedLocations;
      }
    } else {
      return Collections.emptyList();
    }
  } catch (final Exception e) {
    throw new RuntimeException(e);
  }
}
Example #25
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public scala.collection.Iterator<scala.Tuple2<Source<T>, CheckpointMarkT>> compute(
    Partition split, TaskContext context) {
  @SuppressWarnings("unchecked")
  CheckpointableSourcePartition<T, CheckpointMarkT> partition =
      (CheckpointableSourcePartition<T, CheckpointMarkT>) split;
  scala.Tuple2<Source<T>, CheckpointMarkT> tuple2 =
      new scala.Tuple2<>(partition.getSource(), partition.checkpointMark);
  return JavaConversions.asScalaIterator(Collections.singleton(tuple2).iterator());
}
Example #26
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public Partition[] getPartitions() {
  try {
    final List<? extends Source<T>> partitionedSources = microbatchSource.split(options.get());
    final Partition[] partitions = new CheckpointableSourcePartition[partitionedSources.size()];
    for (int i = 0; i < partitionedSources.size(); i++) {
      partitions[i] =
          new CheckpointableSourcePartition<>(
              id(), i, partitionedSources.get(i), EmptyCheckpointMark.get());
    }
    return partitions;
  } catch (Exception e) {
    throw new RuntimeException("Failed to create partitions.", e);
  }
}
Example #27
Source File: MizoRDD.java From mizo with Apache License 2.0
@Override
public scala.collection.Iterator<TReturn> compute(Partition split, TaskContext context) {
  String regionEdgesFamilyPath = this.regionsPaths.get(split.index());
  log.info("Running Mizo on region #{} located at: {}", split.index(), regionEdgesFamilyPath);

  return createRegionIterator(createRegionRelationsIterator(regionEdgesFamilyPath));
}
Example #28
Source File: MizoRDD.java From mizo with Apache License 2.0
@Override
public Partition[] getPartitions() {
  return Iterators.toArray(IntStream
      .range(0, this.regionsPaths.size())
      .mapToObj(i -> (Partition) () -> i)
      .iterator(), Partition.class);
}
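The cast in the stream above works because, from Java's point of view, index() is Partition's only abstract method, so each () -> i becomes a lightweight partition that carries nothing but its index. A more explicit, loop-based equivalent (purely illustrative, not from the Mizo sources) would be:

// Illustrative loop-based equivalent of the stream version (would replace the method body above).
@Override
public Partition[] getPartitions() {
  // Same effect: one index-only Partition per region path.
  Partition[] partitions = new Partition[this.regionsPaths.size()];
  for (int i = 0; i < partitions.length; i++) {
    final int index = i;
    partitions[i] = () -> index; // a Partition whose index() returns the captured value
  }
  return partitions;
}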
Example #29
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public scala.collection.Iterator<WindowedValue<T>> compute(
    final Partition split, final TaskContext context) {
  final MetricsContainer metricsContainer = metricsAccum.value().getContainer(stepName);

  @SuppressWarnings("unchecked")
  final BoundedSource.BoundedReader<T> reader = createReader((SourcePartition<T>) split);

  final Iterator<WindowedValue<T>> readerIterator =
      new ReaderToIteratorAdapter<>(metricsContainer, reader);

  return new InterruptibleIterator<>(context, JavaConversions.asScalaIterator(readerIterator));
}
Example #30
Source File: MongoNativeExtractor.java From deep-spark with Apache License 2.0
@Override
public List<String> getPreferredLocations(Partition split) {
  return removeAddressPort(((DeepPartition) split).splitWrapper().getReplicas());
}