org.apache.hadoop.io.IntWritable Scala Examples
The following examples show how to use org.apache.hadoop.io.IntWritable.
They are taken from open-source projects; each example notes its original project, source file, and license.
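IntWritable is Hadoop's mutable, serializable wrapper around a primitive int, most often used as a key or value type in SequenceFiles and MapReduce jobs. Before the project examples, here is a minimal self-contained sketch of that pattern; the file path and default Configuration are placeholders, not part of any project below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}

object IntWritableRoundTrip extends App {
  // Placeholder configuration and path; adjust for your cluster or local filesystem.
  val conf = new Configuration()
  val path = new Path("/tmp/intwritable-demo.seq")

  // Write three (IntWritable, Text) pairs. IntWritable is mutable, so a single
  // instance can be reused by calling set() before each append.
  val writer = SequenceFile.createWriter(conf,
    SequenceFile.Writer.file(path),
    SequenceFile.Writer.keyClass(classOf[IntWritable]),
    SequenceFile.Writer.valueClass(classOf[Text]))
  val key = new IntWritable()
  val value = new Text()
  for (i <- 1 to 3) {
    key.set(i)
    value.set(s"record-$i")
    writer.append(key, value)
  }
  writer.close()

  // Read the pairs back; next() fills the passed-in writables and
  // returns false once the file is exhausted.
  val reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))
  val k = new IntWritable()
  val v = new Text()
  while (reader.next(k, v)) println(s"${k.get()} -> $v")
  reader.close()
}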
Example 1
Source File: SequenceSink.scala from eel-sdk (Apache License 2.0)
package io.eels.component.sequence

import java.io.StringWriter

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}
import io.eels.{Row, Sink, SinkWriter}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink {

  override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path)

  class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter {

    val writer = SequenceFile.createWriter(conf,
      SequenceFile.Writer.file(path),
      SequenceFile.Writer.keyClass(classOf[IntWritable]),
      SequenceFile.Writer.valueClass(classOf[BytesWritable])
    )

    // records are keyed by an incrementing IntWritable, starting at 0
    val key = new IntWritable(0)

    // the first record written holds the CSV header
    val headers = valuesToCsv(schema.fieldNames())
    writer.append(key, new BytesWritable(headers.getBytes))

    override def close(): Unit = writer.close()

    override def write(row: Row): Unit = {
      this.synchronized {
        val csv = valuesToCsv(row.values)
        writer.append(key, new BytesWritable(csv.getBytes()))
        key.set(key.get() + 1)
      }
    }

    private def valuesToCsv(values: Seq[Any]): String = {
      val swriter = new StringWriter()
      val csv = new CsvWriter(swriter, new CsvWriterSettings())
      csv.writeRow(values.map {
        case null => null
        case other => other.toString
      }: _*)
      csv.close()
      swriter.toString().trim()
    }
  }
}
Example 2
Source File: SequenceSupport.scala from eel-sdk (Apache License 2.0)
package io.eels.component.sequence

import java.io.StringReader
import java.nio.charset.Charset

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels.component.csv.{CsvFormat, CsvSupport}
import io.eels.schema.{Field, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

object SequenceSupport extends Logging with Using {

  def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader =
    new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))

  def toValues(v: BytesWritable): Array[String] = toValues(new String(v.copyBytes(), Charset.forName("UTF8")))

  def toValues(str: String): Array[String] = {
    val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null)
    parser.beginParsing(new StringReader(str))
    val record = parser.parseNext()
    parser.stopParsing()
    record
  }

  def schema(path: Path)(implicit conf: Configuration): StructType = {
    logger.debug(s"Fetching sequence schema for $path")
    using(createReader(path)) { it =>
      val k = new IntWritable()
      val v = new BytesWritable()
      val fields: Array[Field] = {
        it.next(k, v) // the first (IntWritable, BytesWritable) record holds the CSV header row
        toValues(v).map { it => new Field(it) }
      }
      StructType(fields.toList)
    }
  }
}
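For orientation, here is a small usage sketch of the helper above. It is not part of eel-sdk itself; it assumes only the members shown in the example (schema, createReader, toValues), and the path is a placeholder for a file written by SequenceSink.

import io.eels.component.sequence.SequenceSupport
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable}

object SequenceSupportUsage extends App {
  implicit val conf: Configuration = new Configuration() // placeholder configuration
  val path = new Path("/tmp/people.seq")                 // hypothetical file written by SequenceSink

  // The first record holds the CSV header, which is how schema() derives the field names.
  println(SequenceSupport.schema(path).fieldNames())

  // The remaining records are CSV-encoded rows keyed by an incrementing IntWritable.
  val reader = SequenceSupport.createReader(path)
  val k = new IntWritable()
  val v = new BytesWritable()
  reader.next(k, v) // skip the header record
  while (reader.next(k, v))
    println(SequenceSupport.toValues(v).mkString(", "))
  reader.close()
}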
Example 3
Source File: SequenceSource.scala from eel-sdk (Apache License 2.0)
package io.eels.component.sequence

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.io.Using
import io.eels._
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}

case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging {
  logger.debug(s"Creating sequence source from $path")

  override def schema: StructType = SequenceSupport.schema(path)
  override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path))
}

object SequenceReaderIterator {
  def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] {
    private val k = new IntWritable()
    private val v = new BytesWritable()

    // throw away the header
    reader.next(k, v)

    override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector)
    override def hasNext(): Boolean = reader.next(k, v)
  }
}

class SequencePublisher(val path: Path)(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using {

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(SequenceSupport.createReader(path)) { reader =>
        val schema = SequenceSupport.schema(path)
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        SequenceReaderIterator(schema, reader)
          .takeWhile(_ => running.get)
          .grouped(DataStream.DefaultBatchSize)
          .foreach(subscriber.next)

        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
}
Example 4
Source File: SequenceSinkTest.scala from eel-sdk (Apache License 2.0)
package io.eels.component.sequence

import io.eels.datastream.DataStream
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile}
import org.scalatest.{Matchers, WordSpec}

class SequenceSinkTest extends WordSpec with Matchers {

  private val ds = DataStream.fromValues(
    StructType("a", "b", "c", "d"),
    Seq(
      List("1", "2", "3", "4"),
      List("5", "6", "7", "8")
    )
  )

  "SequenceSink" should {
    "write sequence files" in {

      implicit val conf = new Configuration
      implicit val fs = FileSystem.get(conf)

      val path = new Path("seqsink.seq")
      if (fs.exists(path)) fs.delete(path, true)

      ds.to(SequenceSink(path))

      val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path))
      val k = new IntWritable
      val v = new BytesWritable

      val set = for (_ <- 1 to 3) yield {
        reader.next(k, v)
        new String(v.copyBytes)
      }

      set.toSet shouldBe Set(
        "a,b,c,d",
        "1,2,3,4",
        "5,6,7,8"
      )

      reader.close()
      fs.delete(path, true)
    }
  }
}
Example 5
Source File: L3-DStreamMapping.scala from prosparkstreaming (Apache License 2.0)
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.hadoop.io.{Text, LongWritable, IntWritable}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditMappingApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditMappingApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    val tsKey = "created_utc"
    val secs = 1000L

    val keyedByDay = comments.map(rec => {
      val ts = (parse(rec) \ tsKey).values
      (sdf.format(new Date(ts.toString.toLong * secs)), rec)
    })

    val keyedByDayPart = comments.mapPartitions(iter => {
      var ret = List[(String, String)]()
      while (iter.hasNext) {
        val rec = iter.next
        val ts = (parse(rec) \ tsKey).values
        ret.::=(sdf.format(new Date(ts.toString.toLong * secs)), rec)
      }
      ret.iterator
    })

    val wordTokens = comments.map(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val wordTokensFlat = comments.flatMap(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val filterSubreddit = comments.filter(rec =>
      (parse(rec) \ "subreddit").values.toString.equals("AskReddit"))

    val sortedByAuthor = comments.transform(rdd =>
      (rdd.sortBy(rec => (parse(rec) \ "author").values.toString)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 6
Source File: L3-DStreamKeyValue.scala from prosparkstreaming (Apache License 2.0)
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.hadoop.io.{Text, LongWritable, IntWritable}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditKeyValueApp {
  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>")
      System.exit(1)
    }
    val Seq(appName, inputPath, inputPathPopular) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .groupByKey()
      .map(r => (r._2.sum, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .reduceByKey(_ + _)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length))
      .combineByKey(
        (v) => (v, 1),
        (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1),
        (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2),
        new HashPartitioner(ssc.sparkContext.defaultParallelism))
      .map({ case (k, v) => (k, v._1 / v._2.toFloat) })
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubreddit2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2)

    val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubredditCo2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 7
Source File: L3-DStreamVariation.scala from prosparkstreaming (Apache License 2.0)
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.hadoop.io.{Text, LongWritable, IntWritable}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditVariationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditVariationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val merged = comments.union(comments)

    val repartitionedComments = comments.repartition(4)

    val rddMin = comments.glom().map(arr =>
      arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 8
Source File: L3-DStreamWindowAndAction.scala from prosparkstreaming (Apache License 2.0)
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.hadoop.io.{Text, LongWritable, IntWritable}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditWindowAndActionApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditWindowAndActionApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString)
    val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5))
    val windowedCounts = windowedRecs.countByValue()

    windowedCounts.print(10)
    windowedCounts.saveAsObjectFiles("subreddit", "obj")
    windowedCounts.saveAsTextFiles("subreddit", "txt")

    globalCount.saveAsHadoopFiles("subreddit", "hadoop",
      classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]])
    globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop",
      classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]])

    comments.foreachRDD(rdd => {
      LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count()))
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
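The example above hands saveAsHadoopFiles and saveAsNewAPIHadoopFiles explicit IntWritable/Text classes. As a side note (not part of the prosparkstreaming project; app name, master, and paths are placeholders), here is a small batch-Spark sketch showing that the same Int-to-IntWritable pairing is handled implicitly by Spark's writable converters when writing and reading sequence files:

import org.apache.spark.{SparkConf, SparkContext}

object IntWritableSequenceFileDemo extends App {
  // Placeholder app name, master and output path.
  val sc = new SparkContext(new SparkConf().setAppName("seqfile-demo").setMaster("local[2]"))

  val counts = sc.parallelize(Seq((3, "AskReddit"), (1, "scala")))

  // saveAsSequenceFile converts Int to IntWritable and String to Text
  // via Spark's implicit writable factories (the output path must not already exist).
  counts.saveAsSequenceFile("/tmp/subreddit-counts")

  // sequenceFile[Int, String] converts the IntWritable/Text pairs back to plain Scala types on read.
  sc.sequenceFile[Int, String]("/tmp/subreddit-counts").collect().foreach(println)

  sc.stop()
}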
Example 9
Source File: L3-DStreamAggregation.scala from prosparkstreaming (Apache License 2.0)
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}
import org.apache.hadoop.io.{Text, LongWritable, IntWritable}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditAggregationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditAggregationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val recCount = comments.count()
    val recCountValue = comments.countByValue()

    val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString))
      .flatMap(body => body.split(" "))
      .map(word => 1)
      .reduce(_ + _)

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 10
Source File: ErrorEventsWriter.scala from etl-light (MIT License)
package yamrcraft.etlite.writers

import java.io.OutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile, Text}
import org.json4s.jackson.Serialization
import org.json4s.{DefaultFormats, Formats, ShortTypeHints}
import yamrcraft.etlite.utils.FileUtils

case class ErrorInfo(
  errorType: String,
  errorMsg: Option[String]
)

class ErrorEventWriter(folder: String, jobId: Long, partitionId: Int)
  extends ErrorEventsWriter {

  // incremental record id
  var recordId = 1

  val fs = FileUtils.getFS(folder)

  val seqPath = new Path(folder, s"errors_job${jobId}_part$partitionId.seq")
  if (fs.exists(seqPath)) {
    fs.delete(seqPath, false)
  }

  val metaPath = new Path(folder, s"errors_job${jobId}_part$partitionId.meta.seq")
  if (fs.exists(metaPath)) {
    fs.delete(metaPath, false)
  }

  private var seqWriter: Option[SequenceFile.Writer] = None
  private var metaWriter: Option[SequenceFile.Writer] = None

  implicit val formats = new Formats {
    val dateFormat = DefaultFormats.lossless.dateFormat
    override val typeHints = ShortTypeHints(List(classOf[ErrorInfo]))
    override val typeHintFieldName = "type"
  }

  override def write(errorEvent: (Array[Byte], ErrorInfo)) = {
    if (seqWriter.isEmpty) {
      seqWriter = createSequenceFile(seqPath, classOf[IntWritable], classOf[BytesWritable])
      metaWriter = createSequenceFile(metaPath, classOf[IntWritable], classOf[Text])
    }

    val id = new IntWritable(recordId)
    seqWriter.get.append(id, new BytesWritable(errorEvent._1))
    metaWriter.get.append(id, new Text(Serialization.write(errorEvent._2)))

    recordId += 1
  }

  override def commit() = {
    seqWriter.foreach(p => p.close())
    metaWriter.foreach(p => p.close())
  }

  private def createSequenceFile(path: Path, keyClass: Class[_], valueClass: Class[_]) = {
    val optPath = SequenceFile.Writer.file(path)
    val optKey = SequenceFile.Writer.keyClass(keyClass)
    val optVal = SequenceFile.Writer.valueClass(valueClass)
    Some(SequenceFile.createWriter(fs.getConf, optPath, optKey, optVal))
  }
}
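To round off, a brief usage sketch for the writer above. The folder, job id, partition id, and payload are hypothetical, and only the constructor, write, and commit members shown in the example are assumed.

import yamrcraft.etlite.writers.{ErrorEventWriter, ErrorInfo}

object ErrorEventWriterUsage extends App {
  // Placeholder folder, job id and partition id.
  val writer = new ErrorEventWriter("/tmp/errors", jobId = 42L, partitionId = 0)

  // Each error produces entries in two sequence files sharing the same IntWritable record id:
  // the raw payload (BytesWritable) and its JSON-serialized ErrorInfo (Text).
  val payload = """{"malformed": true""".getBytes("UTF-8")
  writer.write((payload, ErrorInfo("ParseError", Some("unexpected end of input"))))

  writer.commit()
}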