org.apache.hadoop.mapred.TextInputFormat Scala Examples
The following examples show how to use org.apache.hadoop.mapred.TextInputFormat.
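Before the project examples, here is a minimal self-contained sketch (not taken from any of the projects below) of the pattern most of them share: SparkContext.hadoopFile with TextInputFormat produces (LongWritable, Text) pairs, i.e. the byte offset of each line and the line itself. The application name and input path are placeholders.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object TextInputFormatQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TextInputFormatQuickStart"))
    try {
      // TextInputFormat keys each record by its byte offset; usually only the text is kept.
      val lines = sc.hadoopFile(
        "hdfs:///tmp/input.txt", // placeholder path
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text])
        .map(_._2.toString)
      lines.take(5).foreach(println)
    } finally {
      sc.stop()
    }
  }
}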
Example 1
Source File: CreateHiveTableAsSelectCommand.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      // Default to plain-text storage (TextInputFormat / LazySimpleSerDe)
      // for any storage property the CTAS statement did not specify.
      val withFormat = tableDesc.withNewStorage(
        inputFormat =
          tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat =
          tableDesc.storage.outputFormat
            .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
        compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for the target table in CTAS.
        // However, we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and only then
    // add the relation into the catalog, in case a failure occurs while
    // processing the data.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // table already exists, do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = true, ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table
          sparkSession.sessionState.catalog.dropTable(tableIdentifier,
            ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
      s"TableName: ${tableDesc.identifier.table}, " +
      s"InsertIntoHiveTable]"
  }
}
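As a usage sketch (not part of the source above): a CTAS statement that omits any storage clause is where these text-format defaults kick in. The table names here are hypothetical.

// Hypothetical CTAS with no STORED AS clause; the command above fills in
// TextInputFormat, HiveIgnoreKeyTextOutputFormat and LazySimpleSerDe as defaults.
spark.sql("CREATE TABLE events_copy AS SELECT * FROM events")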
Example 2
Source File: L9-11CollabFilteringPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object CollabFilteringPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable],
      classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      // Pair every line with the basename (minus extension) of the file it came from.
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit]
          .getPath.toString), splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      // Count occurrences of each (file, value) pair.
      .map(r => ((r._1, r._2), 1))
      .reduceByKey(_ + _)
      .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2)
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }

}
Example 3
Source File: L9-13FPMiningPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object FPMiningPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable],
      classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      // Pair every line with the basename (minus extension) of the file it came from.
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit]
          .getPath.toString), splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => (r._1, r._2))
      .distinct()
      // Collect the distinct values per file into one space-separated transaction line.
      .groupByKey()
      .map(r => r._2.mkString(" "))
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }

}
Example 4
Source File: CreateHiveTableAsSelectCommand.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      // Default to plain-text storage (TextInputFormat / LazySimpleSerDe)
      // for any storage property the CTAS statement did not specify.
      val withFormat = tableDesc.withNewStorage(
        inputFormat =
          tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat =
          tableDesc.storage.outputFormat
            .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
        compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for the target table in CTAS.
        // However, we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and only then
    // add the relation into the catalog, in case a failure occurs while
    // processing the data.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // table already exists, do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = OverwriteOptions(true),
          ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table
          sparkSession.sessionState.catalog.dropTable(tableIdentifier,
            ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
      s"TableName: ${tableDesc.identifier.table}, " +
      s"InsertIntoHiveTable]"
  }
}
Example 5
Source File: CreateHiveTableAsSelectCommand.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      // Default to plain-text storage (TextInputFormat / LazySimpleSerDe)
      // for any storage property the CTAS statement did not specify.
      val withFormat = tableDesc.withNewStorage(
        inputFormat =
          tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat =
          tableDesc.storage.outputFormat
            .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
        compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for the target table in CTAS.
        // However, we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and only then
    // add the relation into the catalog, in case a failure occurs while
    // processing the data.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // table already exists, do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = OverwriteOptions(true),
          ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table
          sparkSession.sessionState.catalog.dropTable(tableIdentifier,
            ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
      s"TableName: ${tableDesc.identifier.table}, " +
      s"InsertIntoHiveTable]"
  }
}
Example 6
Source File: CreateTableAsSelect.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn}
import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes}

private[hive] case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean)
  extends RunnableCommand {

  def database: String = tableDesc.database
  def tableName: String = tableDesc.name

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      // Derive the schema from the query and default to plain-text storage
      // for any format the statement did not specify.
      val withSchema = tableDesc.copy(
        schema = query.output.map(c =>
          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)),
        inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat = tableDesc.outputFormat
          .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))
      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and only then
    // add the relation into the catalog, in case a failure occurs while
    // processing the data.
    if (hiveContext.catalog.tableExists(Seq(database, tableName))) {
      if (allowExisting) {
        // table already exists, do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$database.$tableName already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString
  }
}
Example 7
Source File: HBaseBulkPutExampleFromFile.scala From SparkOnHBase with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf

object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      // Read the input file as (offset, line) pairs and keep only the line text.
      val rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
          System.out.println("reading-" + v._2.toString)
          v._2.toString
        })

      val conf = HBaseConfiguration.create()
      val hbaseContext = new HBaseContext(sc, conf)

      // Write one HBase row per input line; the line's length goes into column c:1.
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
Example 8
Source File: TextFileOverwrite.scala From spark_helper with Apache License 2.0
package org.apache.spark

import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.spark.util.SerializableConfiguration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc =
      (jobConf: JobConf) =>
        FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
}
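A small usage sketch for the helper above, assuming a live SparkContext named sc; the paths and partition count are placeholders.

// Hypothetical call site: read two text files into a single RDD[String].
val lines = TextFileOverwrite.textFile(
  Seq("hdfs:///data/a.txt", "hdfs:///data/b.txt"),
  minPartitions = 4,
  sc)
lines.take(3).foreach(println)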
Example 9
Source File: CreateTableAsSelect.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes, MetastoreRelation}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}

private[hive] case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean)
  extends RunnableCommand {

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def children: Seq[LogicalPlan] = Seq(query)

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]

    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      // Default to plain-text storage for any format the statement did not specify.
      val withFormat = tableDesc.copy(
        inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat = tableDesc.outputFormat
          .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for the target table in CTAS.
        // However, we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.map(c =>
          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)))
      } else {
        withFormat
      }

      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(tableIdentifier, None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and only then
    // add the relation into the catalog, in case a failure occurs while
    // processing the data.
    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // table already exists, do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, TableName: ${tableDesc.name}, InsertIntoHiveTable]"
  }
}
Example 10
Source File: HBaseBulkPutExampleFromFile.scala From hbase-connectors with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} is missing an argument")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      // Read the input file as (offset, line) pairs and keep only the line text.
      val rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
          System.out.println("reading-" + v._2.toString)
          v._2.toString
        })

      val conf = HBaseConfiguration.create()
      val hbaseContext = new HBaseContext(sc, conf)

      // Write one HBase row per input line; the line's length goes into column c:1.
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        })
    } finally {
      sc.stop()
    }
  }
}
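To spot-check what this example writes, here is a hedged read-back sketch using the plain HBase client; it is not part of the source above, the table name and row key are placeholders, and it relies on the c:1 cell holding the Int length written by the bulk put.

// Hypothetical verification: fetch one row written by the bulk put above.
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}

val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
val table = connection.getTable(TableName.valueOf("t1")) // placeholder table name
val result = table.get(new Get(Bytes.toBytes("Value- some input line"))) // placeholder row key
println(Bytes.toInt(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("1"))))
table.close()
connection.close()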