org.apache.hadoop.io.LongWritable Scala Examples
The following examples show how to use org.apache.hadoop.io.LongWritable.
Each example is taken from an open-source project; the source file, project, and license are noted above the code.
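Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: reading a file through a Hadoop InputFormat, where LongWritable carries the byte offset of each record. The application name and input path are placeholders, not taken from any of the projects below.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object LongWritableQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LongWritableQuickStart"))
    // Each record is (byte offset within the file, line contents).
    val lines = sc.newAPIHadoopFile(
      "/tmp/input.txt",                 // placeholder path
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text])
    // Hadoop reuses Writable instances, so copy out plain values before caching or collecting.
    val offsetsAndLines = lines.map { case (offset, line) => (offset.get(), line.toString) }
    offsetsAndLines.take(5).foreach(println)
    sc.stop()
  }
}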
Example 1
Source File: ShxReaderSuite.scala From magellan with Apache License 2.0
package magellan.mapreduce

import magellan.TestSparkContext
import magellan.io.PolygonReader
import org.apache.commons.io.EndianUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{ArrayWritable, LongWritable, Text}
import org.scalatest.FunSuite

class ShxReaderSuite extends FunSuite with TestSparkContext {

  test("Read shx file") {
    val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shx").getPath
    val conf = new Configuration()
    conf.set("mapreduce.input.fileinputformat.split.maxsize", "10000")

    val data = sc.newAPIHadoopFile(
        path,
        classOf[ShxInputFormat],
        classOf[Text],
        classOf[ArrayWritable],
        conf
      ).map { case (txt: Text, splits: ArrayWritable) =>
        val fileName = txt.toString
        val s = splits.get()
        val size = s.length
        var i = 0
        val v = Array.fill(size)(0L)
        while (i < size) {
          v.update(i, s(i).asInstanceOf[LongWritable].get())
          i += 1
        }
        (fileName, v)
      }

    assert(data.count() === 1)
    val (fileName, splits) = data.first()
    assert(fileName === "tl_2016_us_state")

    // the offsets should be correct
    val firstOffset = splits(0)
    val secondOffset = splits(1)

    // skipping to the first offset in the Shapefile should allow me to read the first polygon
    val shpFilePath = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shp").getPath
    val fs = FileSystem.get(sc.hadoopConfiguration)
    var dis = fs.open(new Path(shpFilePath))

    // skip firstOffset # of bytes
    dis.seek(firstOffset)

    // skip record number
    assert(dis.readInt() === 1)

    // read content length
    var contentLength = 16 * (dis.readInt() + 4)

    // extract the shape type
    var shapeType = EndianUtils.swapInteger(dis.readInt())

    // expect a Polygon
    assert(shapeType === 5)

    // the first polygon's content should follow from here
    val polygonReader = new PolygonReader()
    val polygon = polygonReader.readFields(dis)
    assert(polygon != null)

    // seek to the second offset
    dis.seek(secondOffset)
    assert(dis.readInt() === 2)
  }
}
Example 2
Source File: LoadReads.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.spark

import hammerlab.cli.spark._
import hammerlab.collection.canBuildVector
import hammerlab.path._
import org.apache.hadoop.io.LongWritable
import org.apache.spark.rdd.AsNewHadoopPartition
import org.hammerlab.args.SplitSize
import org.seqdoop.hadoop_bam.{ BAMInputFormat, FileVirtualSplit, SAMRecordWritable }
import spark_bam._

trait LoadReads {
  self: PathApp[_] ⇒

  def sparkBamLoad(implicit
                   args: SplitSize.Args,
                   path: Path
                  ): BAMRecordRDD =
    sc.loadSplitsAndReads(
      path,
      splitSize = args.maxSplitSize
    )

  def hadoopBamLoad(implicit
                    args: SplitSize.Args,
                    path: Path
                   ): BAMRecordRDD = {
    args.set

    val rdd =
      sc.newAPIHadoopFile(
        path.toString(),
        classOf[BAMInputFormat],
        classOf[LongWritable],
        classOf[SAMRecordWritable]
      )

    val reads =
      rdd
        .values
        .map(_.get())

    val partitions =
      rdd
        .partitions
        .map(AsNewHadoopPartition(_))
        .map[Split, Vector[Split]](
          _
            .serializableHadoopSplit
            .value
            .asInstanceOf[FileVirtualSplit]: Split
        )

    BAMRecordRDD(partitions, reads)
  }
}
Example 3
Source File: BigQueryReader.scala From sope with Apache License 2.0
package com.sope.spark.utils.google

import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, GsonBigQueryInputFormat}
import com.google.gson.JsonObject
import com.sope.utils.Logging
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{DataFrame, SQLContext}

// Note: the enclosing class declaration (which provides `sc`, `conf` and `sqlContext`)
// is truncated in this snippet; only the load method is shown.

  def load(): DataFrame = {
    import sqlContext.implicits._
    // Load data from BigQuery.
    val tableData = sc.newAPIHadoopRDD(
      conf,
      classOf[GsonBigQueryInputFormat],
      classOf[LongWritable],
      classOf[JsonObject])
      .map(_._2.toString)
    sqlContext.read.json(tableData.toDS)
  }
}
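The snippet above is cut off before the enclosing class declaration, so the sc, conf and sqlContext it uses are not shown. A hedged sketch of the kind of setup it assumes, using the connector's BigQueryConfiguration helper; the wrapper class name and the table ID are hypothetical:

import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext

// Hypothetical stand-in for the truncated class around load().
class BigQueryReaderSetup(val sqlContext: SQLContext, tableId: String) {
  val sc: SparkContext = sqlContext.sparkContext
  val conf = sc.hadoopConfiguration
  // Point GsonBigQueryInputFormat at the source table, e.g. "my-project:my_dataset.my_table".
  BigQueryConfiguration.configureBigQueryInput(conf, tableId)
}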
Example 4
Source File: DenseKMeans.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml import org.apache.hadoop.io.LongWritable import org.apache.log4j.{Level, Logger} import org.apache.mahout.math.VectorWritable import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.{SparkConf, SparkContext} import scopt.OptionParser object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import com.intel.hibench.sparkbench.ml.DenseKMeans.InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default; ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams).map { params => run(params) }.getOrElse { sys.exit(1) } } def run(params: Params) { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") .set("spark.shuffle.compress", "false") .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec") .set("spark.smartCompress", "false") val sc = new SparkContext(conf) // Logger.getRootLogger.setLevel(Level.WARN) val data = sc.sequenceFile[LongWritable, VectorWritable](params.input) val examples = data.map { case (k, v) => var vector: Array[Double] = new Array[Double](v.get().size) for (i <- 0 until v.get().size) vector(i) = v.get().get(i) Vectors.dense(vector) }.cache() // val examples = sc.textFile(params.input).map { line => // Vectors.dense(line.split(' ').map(_.toDouble)) // }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } }
Example 5
Source File: HBaseBulkPutExampleFromFile.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.yetus.audience.InterfaceAudience @InterfaceAudience.Private object HBaseBulkPutExampleFromFile { def main(args: Array[String]) { if (args.length < 3) { println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} are missing an argument") return } val tableName = args(0) val columnFamily = args(1) val inputFile = args(2) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " + tableName + " " + columnFamily + " " + inputFile) val sc = new SparkContext(sparkConf) try { var rdd = sc.hadoopFile( inputFile, classOf[TextInputFormat], classOf[LongWritable], classOf[Text]).map(v => { System.out.println("reading-" + v._2.toString) v._2.toString }) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[String](rdd, TableName.valueOf(tableName), (putRecord) => { System.out.println("hbase-" + putRecord) val put = new Put(Bytes.toBytes("Value- " + putRecord)) put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"), Bytes.toBytes(putRecord.length())) put }); } finally { sc.stop() } } }
Example 6
Source File: unittest.scala From bdg-sequila with Apache License 2.0
import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

case class Region(contigName: String, start: Int, end: Int)

val query =
  """SELECT count(*),targets.contigName,targets.start,targets.end FROM reads JOIN targets
    |ON (
    |  targets.contigName=reads.contigName
    |  AND
    |  reads.end >= targets.start
    |  AND
    |  reads.start <= targets.end
    |)
    |GROUP BY targets.contigName,targets.start,targets.end
    |having contigName='chr1' AND start=20138 AND end=20294""".stripMargin

if (true) {
  spark
    .sparkContext
    .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY,
      ValidationStringency.SILENT.toString)

  val alignments = spark
    .sparkContext.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]("/tmp/NA12878.slice.bam")
    .map(_._2.get)
    .map(r => Region(r.getContig, r.getStart, r.getEnd))

  val reads = spark.sqlContext.createDataFrame(alignments)
  reads.createOrReplaceTempView("reads")

  val targets = spark.sqlContext.createDataFrame(Array(Region("chr1", 20138, 20294)))
  targets.createOrReplaceTempView("targets")

  spark.sql(query).explain(false)

  if (spark.sql(query).first().getLong(0) == 1484L)
    println("TEST PASSED")
  else
    println("TEST FAILED") // the original snippet only built this string, so a failure was silently dropped
}

System.exit(0)
Example 7
Source File: FeatureCountsTestSuite.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.tests.rangejoins import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import htsjdk.samtools.ValidationStringency import org.apache.hadoop.io.LongWritable import org.biodatageeks.sequila.apps.FeatureCounts.Region import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs} import org.scalatest.{BeforeAndAfter, FunSuite} import org.seqdoop.hadoop_bam.util.SAMHeaderReader import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable} class FeatureCountsTestSuite extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext { before { System.setSecurityManager(null) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim( spark) :: Nil } test("Feature counts for chr1:20138-20294") { val query = s""" | SELECT count(*),targets.${Columns.CONTIG},targets.${Columns.START},targets.${Columns.END} | FROM reads JOIN targets |ON ( | targets.${Columns.CONTIG}=reads.${Columns.CONTIG} | AND | reads.${Columns.END} >= targets.${Columns.START} | AND | reads.${Columns.START} <= targets.${Columns.END} |) | GROUP BY targets.${Columns.CONTIG},targets.${Columns.START},targets.${Columns.END} | HAVING ${Columns.CONTIG}='1' AND ${Columns.START} = 20138 AND ${Columns.END} = 20294""".stripMargin spark.sparkContext.hadoopConfiguration.set( SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString) val alignments = spark.sparkContext .newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]( getClass.getResource("/NA12878.slice.bam").getPath) .map(_._2.get) .map(r => Region(DataQualityFuncs.cleanContig(r.getContig), r.getStart, r.getEnd)) val reads = spark.sqlContext .createDataFrame(alignments) .withColumnRenamed("contigName", Columns.CONTIG) .withColumnRenamed("start", Columns.START) .withColumnRenamed("end", Columns.END) reads.createOrReplaceTempView("reads") val targets = spark.sqlContext .createDataFrame(Array(Region("1", 20138, 20294))) .withColumnRenamed("contigName", Columns.CONTIG) .withColumnRenamed("start", Columns.START) .withColumnRenamed("end", Columns.END) targets.createOrReplaceTempView("targets") spark.sql(query).explain(false) assert(spark.sql(query).first().getLong(0) === 1484L) } }
Example 8
Source File: FeatureCounts.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.apps import htsjdk.samtools.ValidationStringency import org.apache.hadoop.io.LongWritable import org.apache.spark.sql.SparkSession import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.biodatageeks.sequila.utils.Columns import org.rogach.scallop.ScallopConf import org.seqdoop.hadoop_bam.{BAMInputFormat, SAMRecordWritable} import org.seqdoop.hadoop_bam.util.SAMHeaderReader object FeatureCounts { case class Region(contig:String, pos_start:Int, pos_end:Int) class RunConf(args:Array[String]) extends ScallopConf(args){ val output = opt[String](required = true) val annotations = opt[String](required = true) val readsFile = trailArg[String](required = true) val Format = trailArg[String](required = false) verify() } def main(args: Array[String]): Unit = { val runConf = new RunConf(args) val spark = SparkSession .builder() .appName("SeQuiLa-FC") .getOrCreate() spark.sqlContext.setConf("spark.biodatageeks.rangejoin.useJoinOrder","true") //spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (1024).toString) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil val query ="""SELECT targets.GeneId AS GeneId, targets.Chr AS Chr, targets.Start AS Start, targets.End AS End, targets.Strand AS Strand, CAST(targets.End AS INTEGER)-CAST(targets.Start AS INTEGER) + 1 AS Length, count(*) AS Counts FROM reads JOIN targets |ON ( | targets.Chr=reads.contigName | AND | reads.end >= CAST(targets.Start AS INTEGER) | AND | reads.start <= CAST(targets.End AS INTEGER) |) |GROUP BY targets.GeneId,targets.Chr,targets.Start,targets.End,targets.Strand""".stripMargin spark .sparkContext .setLogLevel("ERROR") spark .sparkContext .hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString) val alignments = spark .sparkContext.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat](runConf.readsFile()) .map(_._2.get) .map(r => Region(r.getContig, r.getStart, r.getEnd)) val readsTable = spark.sqlContext.createDataFrame(alignments) readsTable.createOrReplaceTempView("reads") val targets = spark .read .option("header", "true") .option("delimiter", "\t") .csv(runConf.annotations()) targets .withColumnRenamed("contigName", Columns.CONTIG) .createOrReplaceTempView("targets") spark.sql(query) .orderBy("GeneId") .coalesce(1) .write .option("header", "true") .option("delimiter", "\t") .csv(runConf.output()) } }
Example 9
Source File: featureCounts.scala From bdg-sequila with Apache License 2.0
import htsjdk.samtools.ValidationStringency import org.apache.hadoop.io.LongWritable import org.apache.spark.SparkContext import org.apache.spark.rdd.NewHadoopRDD import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim import org.biodatageeks.sequila.rangejoins.common.metrics.MetricsCollector import org.seqdoop.hadoop_bam.{BAMInputFormat, FileVirtualSplit, SAMRecordWritable} import org.seqdoop.hadoop_bam.util.SAMHeaderReader val metricsTable = "granges.metrics" sc.hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString) case class PosRecord(contigName:String,start:Int,end:Int) spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil val alignments = sc.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]("/data/granges/NA12878.ga2.exome.maq.recal.bam").map(_._2.get).map(r=>PosRecord(r.getContig,r.getStart,r.getEnd)) val reads=alignments.toDF reads.createOrReplaceTempView("reads") val targets = spark.read.parquet("/data/granges/tgp_exome_hg18.adam") targets.createOrReplaceTempView("targets") val query=""" SELECT targets.contigName,targets.start,targets.end,count(*) FROM reads JOIN targets | ON (targets.contigName=reads.contigName | AND | CAST(reads.end AS INTEGER)>=CAST(targets.start AS INTEGER) | AND | CAST(reads.start AS INTEGER)<=CAST(targets.end AS INTEGER) | ) | GROUP BY targets.contigName,targets.start,targets.end""" spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (100 *1024*1024).toString) val mc = new MetricsCollector(spark,metricsTable) mc.initMetricsTable mc.runAndCollectMetrics( "q_featurecounts_bam_wes", "spark_granges_it_bc_all", Array("reads","targets"), query, true ) val reads = spark.read.parquet("/data/granges/NA12878.ga2.exome.maq.recal.adam") reads.createOrReplaceTempView("reads") val targets = spark.read.parquet("/data/granges/tgp_exome_hg18.adam") targets.createOrReplaceTempView("targets") val mc = new MetricsCollector(spark,metricsTable) mc.initMetricsTable mc.runAndCollectMetrics( "q_featurecounts_adam_wes", "spark_granges_it_bc_all", Array("reads","targets"), query, true )
Example 10
Source File: TextFileOverwrite.scala From spark_helper with Apache License 2.0
package org.apache.spark

import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.spark.util.SerializableConfiguration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc =
      (jobConf: JobConf) =>
        FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
}
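A brief usage sketch of the helper above; the input paths are placeholders. The object is declared inside the org.apache.spark package so that it can reach Spark-private classes such as HadoopRDD and SerializableConfiguration, which is why it is imported from there:

import org.apache.spark.{SparkConf, SparkContext, TextFileOverwrite}

val sc = new SparkContext(new SparkConf().setAppName("TextFileOverwriteDemo").setMaster("local[*]"))
// Read two files (placeholder paths) as a single RDD of lines.
val lines = TextFileOverwrite.textFile(Seq("/tmp/a.txt", "/tmp/b.txt"), sc.defaultMinPartitions, sc)
println(lines.count())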
Example 11
Source File: ControlFilesCreator.scala From spark-benchmarks with Apache License 2.0
package com.bbva.spark.benchmarks.dfsio

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object ControlFilesCreator {

  val BaseFileName = "test_io_"

  def createFiles(controlDirPath: String, numFiles: Int, fileSize: Long)(implicit sc: SparkContext): Unit = {
    sc.parallelize(0 until numFiles, numFiles).map(getFileName).map { fileName =>
      val controlFilePath = new Path(controlDirPath, s"in_file_$fileName")
      (controlFilePath.toString, new LongWritable(fileSize))
    }.saveAsSequenceFileByKey(controlDirPath)
  }

  implicit class RichRDD[T](val self: RDD[T]) extends AnyVal {
    def saveAsSequenceFileByKey[K, V](path: String)(implicit ev: RDD[T] => PairRDDFunctions[K, V]): Unit =
      self.saveAsHadoopFile(path, classOf[Text], classOf[LongWritable], classOf[RDDMultipleSequenceFileOutputFormat])
  }

  private def getFileName(fileIndex: Int): String = BaseFileName + fileIndex

  class RDDMultipleSequenceFileOutputFormat extends MultipleSequenceFileOutputFormat[Any, Any] {

    override def generateActualKey(key: Any, value: Any): Any =
      new Text(key.toString.split("/").last)

    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
      new Path(key.toString).toString
  }
}
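A short driver sketch for the object above, with the control directory, file count and file size as placeholder values; the SparkContext is supplied implicitly, as the signature of createFiles requires:

import com.bbva.spark.benchmarks.dfsio.ControlFilesCreator
import org.apache.spark.{SparkConf, SparkContext}

implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("DFSIO-control"))
// Writes one control sequence file per task, mapping a file name to the requested file size.
ControlFilesCreator.createFiles("/benchmarks/DFSIO/io_control", numFiles = 16, fileSize = 10L * 1024 * 1024)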
Example 12
Source File: HBaseBulkPutExampleFromFile.scala From SparkOnHBase with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.spark.SparkConf object HBaseBulkPutExampleFromFile { def main(args: Array[String]) { if (args.length < 3) { println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}") return } val tableName = args(0) val columnFamily = args(1) val inputFile = args(2) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " + tableName + " " + columnFamily + " " + inputFile) val sc = new SparkContext(sparkConf) try { var rdd = sc.hadoopFile( inputFile, classOf[TextInputFormat], classOf[LongWritable], classOf[Text]).map(v => { System.out.println("reading-" + v._2.toString) v._2.toString }) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[String](rdd, TableName.valueOf(tableName), (putRecord) => { System.out.println("hbase-" + putRecord) val put = new Put(Bytes.toBytes("Value- " + putRecord)) put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"), Bytes.toBytes(putRecord.length())) put }); } finally { sc.stop() } } }
Example 13
Source File: IndexedBinaryBlockReader.scala From hail with MIT License
package is.hail.io import is.hail.annotations.RegionValueBuilder import is.hail.io.fs.{HadoopFS, WrappedSeekableDataInputStream} import org.apache.commons.logging.{Log, LogFactory} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.LongWritable import org.apache.hadoop.mapred._ abstract class KeySerializedValueRecord[K] extends Serializable { var input: Array[Byte] = _ var key: K = _ def setSerializedValue(arr: Array[Byte]) { this.input = arr } def getValue(rvb: RegionValueBuilder, includeGT: Boolean): Unit def setKey(k: K) { this.key = k } def getKey: K = key } abstract class IndexedBinaryBlockReader[T](job: Configuration, split: FileSplit) extends RecordReader[LongWritable, T] { val LOG: Log = LogFactory.getLog(classOf[IndexedBinaryBlockReader[T]].getName) val partitionStart: Long = split.getStart var pos: Long = partitionStart val end: Long = partitionStart + split.getLength val bfis = openFile() def openFile(): HadoopFSDataBinaryReader = { val file: Path = split.getPath val fs: FileSystem = file.getFileSystem(job) val is = fs.open(file) new HadoopFSDataBinaryReader( new WrappedSeekableDataInputStream( HadoopFS.toSeekableInputStream(is))) } def createKey(): LongWritable = new LongWritable() def createValue(): T def getPos: Long = pos def getProgress: Float = { if (partitionStart == end) 0.0f else Math.min(1.0f, (pos - partitionStart) / (end - partitionStart).toFloat) } def close() = bfis.close() }
Example 14
Source File: BigQueryDataFrame.scala From spark-bigquery with Apache License 2.0
package com.samelamin.spark.bigquery import com.google.api.services.bigquery.model.{TableReference, TableSchema} import com.google.cloud.hadoop.io.bigquery._ import com.google.gson._ import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, NullWritable} import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat import org.apache.spark.sql.DataFrame import org.slf4j.LoggerFactory import scala.util.Random def saveAsBigQueryTable(fullyQualifiedOutputTableId: String, isPartitionedByDay: Boolean = false, timePartitionExpiration: Long = 0, writeDisposition: WriteDisposition.Value = null, createDisposition: CreateDisposition.Value = null): Unit = { val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId) val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf) val gcsPath = writeDFToGoogleStorage(adaptedDf,destinationTable,bigQuerySchema) bq.load(destinationTable, bigQuerySchema, gcsPath, isPartitionedByDay, timePartitionExpiration, writeDisposition, createDisposition) delete(new Path(gcsPath)) } def writeDFToGoogleStorage(adaptedDf: DataFrame, destinationTable: TableReference, bqSchema: TableSchema): String = { val tableName = BigQueryStrings.toString(destinationTable) BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString()) hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName) val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY) val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}" val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp" if(hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) { hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath) } logger.info(s"Loading $gcsPath into $tableName") adaptedDf .toJSON .rdd .map(json => (null, jsonParser.parse(json))) .saveAsNewAPIHadoopFile(gcsPath, classOf[GsonBigQueryInputFormat], classOf[LongWritable], classOf[TextOutputFormat[NullWritable, JsonObject]], hadoopConf) gcsPath } private def delete(path: Path): Unit = { val fs = FileSystem.get(path.toUri, hadoopConf) fs.delete(path, true) } }
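The two methods above belong to a wrapper class whose declaration is cut off in this snippet (adaptedDf, bq, hadoopConf, logger and self are fields defined there). A hedged usage sketch follows; the implicit extension import, bucket, input path and table ID are assumptions rather than something shown above:

import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
import com.samelamin.spark.bigquery._ // assumed: brings the saveAsBigQueryTable extension into scope
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("bq-export").getOrCreate()
// The writer stages JSON in GCS first, so a bucket must be configured (the same key is read above).
spark.conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, "my-staging-bucket") // placeholder bucket
val df = spark.read.json("/tmp/events.json")                              // placeholder input
df.saveAsBigQueryTable("my-project:my_dataset.events")                    // placeholder table ID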
Example 15
Source File: RosbagInputFormat.scala From ros_hadoop with Apache License 2.0
package de.valtech.foss import scala.io.Source import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable} import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.input.FileInputFormat object RosbagInputFormat { def getRosChunkIdx(context: JobContext): String = { context.getConfiguration.get("RosbagInputFormat.chunkIdx") } def getBlockSize(context: JobContext): Long = { context.getConfiguration.get("dfs.blocksize").toLong } } class RosbagBytesInputFormat extends FileInputFormat[LongWritable, BytesWritable] { private var rosChunkIdx = "" private var recordLength = -1L override def isSplitable(context: JobContext, filename: Path): Boolean = { rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context) recordLength = RosbagInputFormat.getBlockSize(context) true } override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = { val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize) defaultSize } override def createRecordReader(split: InputSplit, context: TaskAttemptContext) : RecordReader[LongWritable, BytesWritable] = { new RosbagBytesRecordReader } } class RosbagMapInputFormat extends FileInputFormat[LongWritable, MapWritable] { private var rosChunkIdx = "" private var recordLength = -1L override def isSplitable(context: JobContext, filename: Path): Boolean = { rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context) recordLength = RosbagInputFormat.getBlockSize(context) true } override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = { val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize) defaultSize } override def createRecordReader(split: InputSplit, context: TaskAttemptContext) : RecordReader[LongWritable, MapWritable] = { new RosbagMapRecordReader } }
Example 16
Source File: ShapefileRelation.scala From magellan with Apache License 2.0
package magellan import java.util.Objects import magellan.io._ import magellan.mapreduce._ import org.apache.hadoop.io.{ArrayWritable, LongWritable, MapWritable, Text} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import scala.collection.JavaConversions._ import scala.util.Try case class ShapeFileRelation( path: String, parameters: Map[String, String]) (@transient val sqlContext: SQLContext) extends SpatialRelation { protected override def _buildScan(): RDD[Array[Any]] = { // read the shx files, if they exist val fileNameToFileSplits = Try(sc.newAPIHadoopFile( path + "/*.shx", classOf[ShxInputFormat], classOf[Text], classOf[ArrayWritable] ).map { case (txt: Text, splits: ArrayWritable) => val fileName = txt.toString val s = splits.get() val size = s.length var i = 0 val v = Array.fill(size)(0L) while (i < size) { v.update(i, s(i).asInstanceOf[LongWritable].get()) i += 1 } (fileName, v) }.collectAsMap()) fileNameToFileSplits.map(SplitInfos.SPLIT_INFO_MAP.set(_)) val shapefileRdd = sqlContext.sparkContext.newAPIHadoopFile( path + "/*.shp", classOf[ShapeInputFormat], classOf[ShapeKey], classOf[ShapeWritable] ) val dbaseRdd = sqlContext.sparkContext.newAPIHadoopFile( path + "/*.dbf", classOf[DBInputFormat], classOf[ShapeKey], classOf[MapWritable] ) val dataRdd = shapefileRdd.map { case (k, v) => ((k.getFileNamePrefix(), k.getRecordIndex()), v.shape) } val metadataRdd = dbaseRdd.map { case (k, v) => val meta = v.entrySet().map { kv => val k = kv.getKey.asInstanceOf[Text].toString val v = kv.getValue.asInstanceOf[Text].toString (k, v) }.toMap ((k.getFileNamePrefix(), k.getRecordIndex()), meta) } dataRdd.leftOuterJoin(metadataRdd).map(f => Array(f._2._1, f._2._2)) } override def hashCode(): Int = Objects.hash(path, schema) }
Example 17
Source File: L9-13FPMiningPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.HadoopRDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import com.google.common.io.Files object FPMiningPreprocessingApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>") System.exit(1) } val Seq(appName, iPath, oPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val delim = " " val sc = new SparkContext(conf) sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) .asInstanceOf[HadoopRDD[LongWritable, Text]] .mapPartitionsWithInputSplit((iSplit, iter) => iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) .filter(r => r._2 != "0") .map(r => (r._1, r._2)) .distinct() .groupByKey() .map(r => r._2.mkString(" ")) .sample(false, 0.7) .coalesce(1) .saveAsTextFile(oPath) } }
Example 18
Source File: L9-11CollabFilteringPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.FileSplit import org.apache.hadoop.mapred.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.HadoopRDD import org.apache.spark.rdd.RDD.rddToPairRDDFunctions import com.google.common.io.Files object CollabFilteringPreprocessingApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>") System.exit(1) } val Seq(appName, iPath, oPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val delim = " " val sc = new SparkContext(conf) sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions) .asInstanceOf[HadoopRDD[LongWritable, Text]] .mapPartitionsWithInputSplit((iSplit, iter) => iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1)))) .filter(r => r._2 != "0") .map(r => ((r._1, r._2), 1)) .reduceByKey(_ + _) .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2) .sample(false, 0.7) .coalesce(1) .saveAsTextFile(oPath) } }
Example 19
Source File: L3-DStreamAggregation.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditAggregationApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditAggregationApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val recCount = comments.count() val recCountValue = comments.countByValue() val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString)) .flatMap(body => body.split(" ")) .map(word => 1) .reduce(_ + _) ssc.start() ssc.awaitTermination() } }
Example 20
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditWindowAndActionApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditWindowAndActionApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString) val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5)) val windowedCounts = windowedRecs.countByValue() windowedCounts.print(10) windowedCounts.saveAsObjectFiles("subreddit", "obj") windowedCounts.saveAsTextFiles("subreddit", "txt") globalCount.saveAsHadoopFiles("subreddit", "hadoop", classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]]) globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop", classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]]) comments.foreachRDD(rdd => { LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count())) }) ssc.start() ssc.awaitTermination() } }
Example 21
Source File: L3-1DStreams.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import scala.io.Source import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.hadoop.io.LongWritable import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.hadoop.io.Text object StreamingTranslateApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingTranslateApp <appname> <book_path> <output_path> <language>") System.exit(1) } val Seq(appName, bookPath, outputPath, lang) = args.toSeq val dict = getDictionary(lang) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) val book = ssc.textFileStream(bookPath) val translated = book.map(line => line.split("\\s+").map(word => dict.getOrElse(word, word)).mkString(" ")) translated.saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } def getDictionary(lang: String): Map[String, String] = { if (!Set("German", "French", "Italian", "Spanish").contains(lang)) { System.err.println( "Unsupported language: %s".format(lang)) System.exit(1) } val url = "http://www.june29.com/IDP/files/%s.txt".format(lang) println("Grabbing dictionary from: %s".format(url)) Source.fromURL(url, "ISO-8859-1").mkString .split("\\r?\\n") .filter(line => !line.startsWith("#")) .map(line => line.split("\\t")) .map(tkns => (tkns(0).trim, tkns(1).trim)).toMap } }
Example 22
Source File: L3-DStreamVariation.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditVariationApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditVariationApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val merged = comments.union(comments) val repartitionedComments = comments.repartition(4) val rddMin = comments.glom().map(arr => arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt))) ssc.start() ssc.awaitTermination() } }
Example 23
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date import org.apache.spark.HashPartitioner object RedditKeyValueApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>") System.exit(1) } val Seq(appName, inputPath, inputPathPopular) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .groupByKey() .map(r => (r._2.sum, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1)) .reduceByKey(_ + _) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length)) .combineByKey( (v) => (v, 1), (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1), (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2), new HashPartitioner(ssc.sparkContext.defaultParallelism)) .map({ case (k, v) => (k, v._1 / v._2.toFloat) }) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubreddit2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2) val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec)) val keyedBySubredditCo2 = popular.map(rec => ({ val t = rec.split(",") (t(1).split("/")(4), t(0)) })) val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2) val checkpointPath = "/tmp" ssc.checkpoint(checkpointPath) val updateFunc = (values: Seq[Int], state: Option[Int]) => { val currentCount = values.sum val previousCount = state.getOrElse(0) Some(currentCount + previousCount) } val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1)) val globalCount = keyedBySubredditState.updateStateByKey(updateFunc) .map(r => (r._2, r._1)) .transform(rdd => rdd.sortByKey(ascending = false)) ssc.start() ssc.awaitTermination() } }
Example 24
Source File: L3-DStreamMapping.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext } import org.apache.hadoop.io.{ Text, LongWritable, IntWritable } import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.streaming.dstream.DStream import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat } import org.apache.spark.streaming.dstream.PairDStreamFunctions import org.apache.log4j.LogManager import org.json4s._ import org.json4s.native.JsonMethods._ import java.text.SimpleDateFormat import java.util.Date object RedditMappingApp { def main(args: Array[String]) { if (args.length != 2) { System.err.println( "Usage: RedditMappingApp <appname> <input_path>") System.exit(1) } val Seq(appName, inputPath) = args.toSeq val LOG = LogManager.getLogger(this.getClass) val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(1)) LOG.info("Started at %d".format(ssc.sparkContext.startTime)) val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) val sdf = new SimpleDateFormat("yyyy-MM-dd") val tsKey = "created_utc" val secs = 1000L val keyedByDay = comments.map(rec => { val ts = (parse(rec) \ tsKey).values (sdf.format(new Date(ts.toString.toLong * secs)), rec) }) val keyedByDayPart = comments.mapPartitions(iter => { var ret = List[(String, String)]() while (iter.hasNext) { val rec = iter.next val ts = (parse(rec) \ tsKey).values ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec) } ret.iterator }) val wordTokens = comments.map(rec => { ((parse(rec) \ "body")).values.toString.split(" ") }) val wordTokensFlat = comments.flatMap(rec => { ((parse(rec) \ "body")).values.toString.split(" ") }) val filterSubreddit = comments.filter(rec => (parse(rec) \ "subreddit").values.toString.equals("AskReddit")) val sortedByAuthor = comments.transform(rdd => (rdd.sortBy(rec => (parse(rec) \ "author").values.toString))) ssc.start() ssc.awaitTermination() } }
Example 25
Source File: L4-4Kryo.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

object VoyagerAppKryo {
  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: VoyagerAppKryo <appname> <inputPath> <outputPath>")
      System.exit(1)
    }
    val Seq(appName, inputPath, outputPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[ProtonFlux]))

    val ssc = new StreamingContext(conf, Seconds(10))

    val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath,
      (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)
    val projected = voyager1.map(rec => {
      val attrs = rec.split("\\s+")
      new ProtonFlux(attrs(0), attrs(18), attrs(19), attrs(20), attrs(21), attrs(22),
        attrs(23), attrs(24), attrs(25), attrs(26), attrs(27), attrs(28))
    })
    val filtered = projected.filter(pflux => pflux.isSolarStorm)
    val yearlyBreakdown = filtered.map(rec => (rec.year, 1))
      .reduceByKey(_ + _)
      .transform(rec => rec.sortByKey(ascending = false))
    yearlyBreakdown.saveAsTextFiles(outputPath)

    ssc.start()
    ssc.awaitTermination()
  }
}
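The ProtonFlux class registered with Kryo above is defined elsewhere in that project and is not part of this snippet. A minimal sketch consistent with how it is used here (a year field plus eleven flux attributes and an isSolarStorm test) might look like the following; the greater-than-1.0 threshold is an assumption borrowed from the VoyagerApp example that follows:

class ProtonFlux(
    val year: String,
    flux1: String, flux2: String, flux3: String, flux4: String,
    flux5: String, flux6: String, flux7: String, flux8: String,
    flux9: String, flux10: String, flux11: String) extends Serializable {

  private val fluxes =
    Array(flux1, flux2, flux3, flux4, flux5, flux6,
      flux7, flux8, flux9, flux10, flux11).map(_.toDouble)

  // Assumed threshold: any flux channel above 1.0 counts as a solar storm.
  def isSolarStorm: Boolean = fluxes.exists(_ > 1.0)
}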
Example 26
Source File: L4-1Voyager.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark import org.apache.hadoop.fs.Path import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions object VoyagerApp { def main(args: Array[String]) { if (args.length != 3) { System.err.println( "Usage: VoyagerApp <appname> <inputPath> <outputPath>") System.exit(1) } val Seq(appName, inputPath, outputPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) .set("spark.executor.extraJavaOptions", "-XX:+UseConcMarkSweepGC") val ssc = new StreamingContext(conf, Seconds(10)) val voyager1 = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString) voyager1.map(rec => { val attrs = rec.split("\\s+") ((attrs(0).toInt), attrs.slice(18, 28).map(_.toDouble)) }).filter(pflux => pflux._2.exists(_ > 1.0)).map(rec => (rec._1, 1)) .reduceByKey(_ + _) .transform(rec => rec.sortByKey(ascending = false, numPartitions = 1)).saveAsTextFiles(outputPath) ssc.start() ssc.awaitTermination() } }
Example 27
Source File: Extractors.scala From streamliner-examples with Apache License 2.0
package com.memsql.streamliner.examples import org.apache.spark._ import org.apache.spark.rdd._ import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ import org.apache.spark.sql._ import org.apache.spark.sql.types._ import com.memsql.spark.connector._ import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat // The simplest implementation of an Extractor just provides a next method. // This is useful for prototyping and debugging. class ConstantExtractor extends Extractor { override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { logger.info("extracting a constant sequence DataFrame") val schema = StructType(StructField("number", IntegerType, false) :: Nil) val sampleData = List(1,2,3,4,5) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val df = sqlContext.createDataFrame(rowRDD, schema) Some(df) } } // An Extractor can also be configured with the config blob that is provided in // MemSQL Ops. class ConfigurableConstantExtractor extends Extractor { override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val userConfig = config.asInstanceOf[UserExtractConfig] val start = userConfig.getConfigInt("start").getOrElse(1) val end = userConfig.getConfigInt("end").getOrElse(5) val columnName = userConfig.getConfigString("column_name").getOrElse("number") logger.info("extracting a sequence DataFrame from $start to $end") val schema = StructType(StructField(columnName, IntegerType, false) :: Nil) val sampleData = List.range(start, end + 1) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val df = sqlContext.createDataFrame(rowRDD, schema) Some(df) } } // A more complex Extractor which maintains some state can be implemented using // the initialize and cleanup methods. class SequenceExtractor extends Extractor { var i: Int = Int.MinValue override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] i = userConfig.getConfigInt("sequence", "initial_value").getOrElse(0) logger.info(s"initializing the sequence at $i") } override def cleanup(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { logger.info("cleaning up the sequence") } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val userConfig = config.asInstanceOf[UserExtractConfig] val sequenceSize = userConfig.getConfigInt("sequence", "size").getOrElse(5) logger.info(s"emitting a sequence RDD from $i to ${i + sequenceSize}") val schema = StructType(StructField("number", IntegerType, false) :: Nil) i += sequenceSize val sampleData = List.range(i - sequenceSize, i) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val df = sqlContext.createDataFrame(rowRDD, schema) Some(df) } }
Example 28
Source File: XmlFile.scala From spark-xml with Apache License 2.0
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
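XmlFile is package-private, so user code does not call withCharset directly. A short sketch of the public reader path that sits on top of utilities like this one; the row tag, charset and input path are placeholders:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("xml-demo").master("local[*]").getOrCreate()
val books = spark.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")     // placeholder row tag
  .option("charset", "UTF-8")
  .load("/tmp/books.xml")       // placeholder path
books.printSchema()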
Example 29
Source File: InputFormatConf.scala From flint with Apache License 2.0
package com.twosigma.flint.hadoop import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.io.{ LongWritable, Text, Writable } import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader } import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat } import scala.collection.immutable trait InputFormatConf[K, V] extends Serializable { type IF <: InputFormat[K, V] type Split <: InputSplit with Writable type KExtract <: Extract[K] type VExtract <: Extract[V] def kExtract: KExtract def vExtract: VExtract def makeInputFormat(): IF // I'm unsure if we should WriSer them for them def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]] // TODO do we want to require typing of the RecordReader as well? final def createRecordReader(hadoopConf: Configuration, split: Split, inputFormat: IF = makeInputFormat()): RecordReader[K, V] = { val tac = ConfOnlyTAC(hadoopConf) val recordReader = inputFormat.createRecordReader(split, tac) recordReader.initialize(split, tac) recordReader } } case class TextInputFormatConf(file: String, partitions: Int) extends InputFormatConf[LongWritable, Text] { type IF = TextInputFormat type Split = FileSplit // TODO now that we figured out what's up, see if we can't eliminate the need for this... val internalK = Extract.unit[LongWritable] val internalV = Extract.text type KExtract = internalK.type type VExtract = internalV.type override val kExtract: KExtract = internalK override val vExtract: VExtract = internalV def makeInputFormat() = new TextInputFormat() def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = { val job = Job.getInstance(hadoopConf) FileInputFormat.setInputPaths(job, file) val path = new Path(file) val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen val size_per = math.round(len / partitions.toDouble) ((0 until partitions - 1).map { p => new FileSplit(path, size_per * p, size_per, null) } :+ { val fin = size_per * (partitions - 1) new FileSplit(path, fin, len - fin, null) }).map(WriSer(_)) } } // TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf object CSVInputFormatConf { def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract } = new InputFormatConf[LongWritable, V] { type IF = ifc.IF type Split = ifc.Split type KExtract = ifc.KExtract type VExtract = ifc.VExtract override val kExtract: KExtract = ifc.kExtract override val vExtract: VExtract = ifc.vExtract override def makeInputFormat() = ifc.makeInputFormat() override def makeSplits(hadoopConf: Configuration) = { val splits = ifc.makeSplits(hadoopConf) splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) => val rr = createRecordReader(hadoopConf, head) require(rr.nextKeyValue, "csv has no header, first line was empty") val afterHeader = rr.getCurrentKey.get require(rr.nextKeyValue, "first split is empty") WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +: splits.tail } } } }
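A hedged driver sketch for the text case above; the input path and partition count are placeholders, and the WriSer wrapper is unwrapped by pattern matching in the same way the CSV helper above does:

import com.twosigma.flint.hadoop.{TextInputFormatConf, WriSer}
import org.apache.hadoop.conf.Configuration

val hadoopConf = new Configuration()
val ifc = TextInputFormatConf("/tmp/input.txt", partitions = 4) // placeholder path
// Build the byte-range splits, then read the first one with a fresh record reader.
val splits = ifc.makeSplits(hadoopConf)
val WriSer(firstSplit) = splits.head
val reader = ifc.createRecordReader(hadoopConf, firstSplit)
while (reader.nextKeyValue()) {
  println(s"offset=${reader.getCurrentKey.get} line=${reader.getCurrentValue}")
}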