org.apache.spark.rdd.HadoopRDD Scala Examples
The following examples show how to use org.apache.spark.rdd.HadoopRDD.
Each example is taken from the open-source project named above it.
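Before the individual examples, here is a minimal sketch, distilled from the examples below, of the common pattern: the RDD returned by SparkContext.hadoopFile is backed by a HadoopRDD, so casting it exposes mapPartitionsWithInputSplit, which can be used, for instance, to tag each record with the file it was read from. The application name and input path are placeholders.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, TextInputFormat}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.HadoopRDD

object HadoopRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HadoopRDDSketch"))

    // sc.hadoopFile is backed by a HadoopRDD; the cast exposes
    // HadoopRDD-only methods such as mapPartitionsWithInputSplit.
    val lines = sc.hadoopFile("hdfs:///path/to/input", classOf[TextInputFormat],
        classOf[LongWritable], classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]

    // Pair every line with the name of the file its InputSplit points at.
    val tagged = lines.mapPartitionsWithInputSplit((split, iter) =>
      iter.map { case (_, text) =>
        (split.asInstanceOf[FileSplit].getPath.getName, text.toString)
      })

    tagged.take(5).foreach(println)
    sc.stop()
  }
}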
Example 1
Source File: JavaHadoopRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
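JavaHadoopRDD is the Java-API wrapper around HadoopRDD; JavaSparkContext.hadoopFile and hadoopRDD return instances of it (typed as JavaPairRDD), and it adds mapPartitionsWithInputSplit for Java callers. The following is a hedged usage sketch written in Scala against the Java API; the input path, partition count, and object name are assumptions for illustration.

import scala.collection.JavaConverters._

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileSplit, InputSplit, TextInputFormat}

import org.apache.spark.SparkConf
import org.apache.spark.api.java.{JavaHadoopRDD, JavaSparkContext}
import org.apache.spark.api.java.function.{Function2 => JFunction2}

object JavaHadoopRDDUsageSketch {
  def main(args: Array[String]): Unit = {
    val jsc = new JavaSparkContext(new SparkConf().setAppName("JavaHadoopRDDUsageSketch"))

    // JavaSparkContext.hadoopFile is backed by a JavaHadoopRDD, so the cast is safe.
    val hadoopRdd = jsc
      .hadoopFile("hdfs:///path/to/input", classOf[TextInputFormat],
        classOf[LongWritable], classOf[Text], 2)
      .asInstanceOf[JavaHadoopRDD[LongWritable, Text]]

    // Prefix every line with the name of the file its partition was read from.
    val tagged = hadoopRdd.mapPartitionsWithInputSplit(
      new JFunction2[InputSplit, java.util.Iterator[(LongWritable, Text)], java.util.Iterator[String]] {
        override def call(split: InputSplit,
            records: java.util.Iterator[(LongWritable, Text)]): java.util.Iterator[String] = {
          val fileName = split.asInstanceOf[FileSplit].getPath.getName
          records.asScala.map(record => fileName + "\t" + record._2.toString).asJava
        }
      },
      false)

    tagged.take(5).asScala.foreach(println)
    jsc.stop()
  }
}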
Example 2
Source File: L9-11CollabFilteringPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object CollabFilteringPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable],
        classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine =>
          (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString),
            splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => ((r._1, r._2), 1))
      .reduceByKey(_ + _)
      .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2)
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }
}
Example 3
Source File: L9-13FPMiningPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object FPMiningPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable],
        classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine =>
          (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString),
            splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => (r._1, r._2))
      .distinct()
      .groupByKey()
      .map(r => r._2.mkString(" "))
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }
}
Example 4
Source File: JavaHadoopRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 5
Source File: JavaHadoopRDD.scala From SparkCore with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 6
Source File: JavaHadoopRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 7
Source File: JavaHadoopRDD.scala From iolap with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 8
Source File: JavaHadoopRDD.scala From spark1.52 with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 9
Source File: JavaHadoopRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 10
Source File: TextFileOverwrite.scala From spark_helper with Apache License 2.0
package org.apache.spark

import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.spark.util.SerializableConfiguration

import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc = (jobConf: JobConf) =>
      FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
}
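A brief usage sketch for the helper above; the input paths, partition count, and driver object are hypothetical. The object lives in the org.apache.spark package so that it can reach the package-private HadoopRDD constructor and SerializableConfiguration, but callers import it like any other utility.

import org.apache.spark.{SparkConf, SparkContext, TextFileOverwrite}

object TextFileOverwriteUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TextFileOverwriteUsageSketch"))

    // Read several input paths into a single RDD[String], like sc.textFile
    // but with an explicit Seq of paths.
    val lines = TextFileOverwrite.textFile(
      Seq("hdfs:///data/part-1.txt", "hdfs:///data/part-2.txt"),
      minPartitions = 2,
      sc = sc)

    println(lines.count())
    sc.stop()
  }
}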
Example 11
Source File: JavaHadoopRDD.scala From BigDatalog with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K],
     implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}