org.apache.spark.rdd.NewHadoopRDD Scala Examples
The following examples show how to use org.apache.spark.rdd.NewHadoopRDD.
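Before the project-specific examples, here is a minimal sketch (not taken from any of the projects below; the input path, app name, and object name are placeholders) of how a NewHadoopRDD is typically obtained: SparkContext.newAPIHadoopFile builds one internally for any new-API (org.apache.hadoop.mapreduce) InputFormat, and casting the result back to NewHadoopRDD exposes the @DeveloperApi method mapPartitionsWithInputSplit that most of the examples below wrap.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.NewHadoopRDD

object NewHadoopRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("NewHadoopRDDSketch").setMaster("local[*]"))

    // newAPIHadoopFile returns RDD[(K, V)]; the concrete instance is a NewHadoopRDD.
    val lines = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat]("/tmp/input.txt")

    // Downcast to reach mapPartitionsWithInputSplit: each partition's records are
    // tagged with the InputSplit they were read from.
    val tagged = lines.asInstanceOf[NewHadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((split, iter) => iter.map { case (_, text) => (split.toString, text.toString) },
        preservesPartitioning = false)

    tagged.take(5).foreach(println)
    sc.stop()
  }
}

The JavaNewHadoopRDD wrappers in Examples 1-3, 5, 6 and 8-10 expose this same method to the Java API across different Spark forks.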
Example 1
Source File: JavaNewHadoopRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 2
Source File: JavaNewHadoopRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 3
Source File: JavaNewHadoopRDD.scala From SparkCore with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 4
Source File: ProtoParquetRDD.scala From sparksql-protobuf with Apache License 2.0
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
    sc: SparkContext,
    input: String,
    protoClass: Class[T],
    @transient conf: Configuration
  ) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
}
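A hedged usage sketch for the RDD above: the auxiliary constructor takes only a SparkContext, an input path, and a protobuf-generated message class. MyMessage and the path are placeholders (not names from the sparksql-protobuf project), and sc is assumed to be an existing SparkContext, e.g. in spark-shell.

import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD

// Read protobuf records back out of a Parquet file as an RDD[MyMessage].
val protoRdd = new ProtoParquetRDD(sc, "/data/messages.parquet", classOf[MyMessage])
protoRdd.map(_.toString).take(10).foreach(println)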
Example 5
Source File: JavaNewHadoopRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 6
Source File: JavaNewHadoopRDD.scala From iolap with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 7
Source File: HBaseSimpleRDD.scala From spark-hbase-connector with Apache License 2.0
package it.nerdammer.spark.hbase

import it.nerdammer.spark.hbase.conversion.FieldReader
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, TaskContext}

import scala.reflect.ClassTag

class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0)
                                 (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String])
  extends RDD[R](hadoopHBase) {

  override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions

  override def compute(split: Partition, context: TaskContext) = {
    // val cleanConversion = sc.clean ---> next version
    firstParent[(ImmutableBytesWritable, Result)].iterator(split, context)
      .map(e => conversion(e._1, e._2))
  }

  def conversion(key: ImmutableBytesWritable, row: Result) = {
    val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns)

    val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames)

    val columns = columnNamesFC
      .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2)))
      .map(t => if (row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None)
      .toList

    mapper.map(Some(key.get.drop(saltingLength)) :: columns)
  }
}
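HBaseSimpleRDD is normally produced by the connector's builder API rather than constructed by hand. The following is a hedged sketch of that entry point, modeled on the spark-hbase-connector README; the table, column, and column-family names are placeholders, and sc is assumed to be a SparkContext whose configuration already points at an HBase cluster (spark.hbase.host).

import it.nerdammer.spark.hbase._

// Reads rows of "my_table" as (column1, column2) tuples; the connector resolves
// an HBaseSimpleRDD behind this builder chain.
val hBaseRDD = sc.hbaseTable[(String, Int)]("my_table")
  .select("column1", "column2")
  .inColumnFamily("my_cf")

hBaseRDD.take(10).foreach(println)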
Example 8
Source File: JavaNewHadoopRDD.scala From spark1.52 with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 9
Source File: JavaNewHadoopRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 10
Source File: JavaNewHadoopRDD.scala From BigDatalog with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 11
Source File: featureCounts.scala From bdg-sequila with Apache License 2.0
import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.SparkContext
import org.apache.spark.rdd.NewHadoopRDD
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.biodatageeks.sequila.rangejoins.common.metrics.MetricsCollector
import org.seqdoop.hadoop_bam.{BAMInputFormat, FileVirtualSplit, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

val metricsTable = "granges.metrics"
sc.hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)

case class PosRecord(contigName: String, start: Int, end: Int)

spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil

val alignments = sc
  .newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]("/data/granges/NA12878.ga2.exome.maq.recal.bam")
  .map(_._2.get)
  .map(r => PosRecord(r.getContig, r.getStart, r.getEnd))
val reads = alignments.toDF
reads.createOrReplaceTempView("reads")

val targets = spark.read.parquet("/data/granges/tgp_exome_hg18.adam")
targets.createOrReplaceTempView("targets")

// .stripMargin added so the '|' margins are removed before the query is run.
val query = """SELECT targets.contigName,targets.start,targets.end,count(*) FROM reads JOIN targets
              | ON (targets.contigName=reads.contigName
              | AND
              | CAST(reads.end AS INTEGER)>=CAST(targets.start AS INTEGER)
              | AND
              | CAST(reads.start AS INTEGER)<=CAST(targets.end AS INTEGER)
              | )
              | GROUP BY targets.contigName,targets.start,targets.end""".stripMargin

spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (100 * 1024 * 1024).toString)

val mc = new MetricsCollector(spark, metricsTable)
mc.initMetricsTable
mc.runAndCollectMetrics(
  "q_featurecounts_bam_wes",
  "spark_granges_it_bc_all",
  Array("reads", "targets"),
  query,
  true
)

val reads = spark.read.parquet("/data/granges/NA12878.ga2.exome.maq.recal.adam")
reads.createOrReplaceTempView("reads")

val targets = spark.read.parquet("/data/granges/tgp_exome_hg18.adam")
targets.createOrReplaceTempView("targets")

val mc = new MetricsCollector(spark, metricsTable)
mc.initMetricsTable
mc.runAndCollectMetrics(
  "q_featurecounts_adam_wes",
  "spark_granges_it_bc_all",
  Array("reads", "targets"),
  query,
  true
)
Example 12
Source File: NewHBaseRDD.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.conf.Configuration
import org.apache.yetus.audience.InterfaceAudience
import org.apache.hadoop.mapreduce.InputFormat
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

@InterfaceAudience.Public
class NewHBaseRDD[K, V](@transient val sc: SparkContext,
                        @transient val inputFormatClass: Class[_ <: InputFormat[K, V]],
                        @transient val keyClass: Class[K],
                        @transient val valueClass: Class[V],
                        @transient private val __conf: Configuration,
                        val hBaseContext: HBaseContext)
  extends NewHadoopRDD(sc, inputFormatClass, keyClass, valueClass, __conf) {

  override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
    hBaseContext.applyCreds()
    super.compute(theSplit, context)
  }
}
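In hbase-connectors, NewHBaseRDD is normally instantiated for you by HBaseContext rather than constructed directly. A hedged sketch of the usual entry point follows; the table name is a placeholder and sc is assumed to be an existing SparkContext.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext

val hbaseConf = HBaseConfiguration.create()
val hbaseContext = new HBaseContext(sc, hbaseConf)

// hbaseRDD wires the scan into a NewHBaseRDD, so applyCreds() runs before each partition is computed.
val scanRdd = hbaseContext.hbaseRDD(TableName.valueOf("my_table"), new Scan())
scanRdd.map { case (_, result) => result.rawCells().length }.count()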