org.apache.hadoop.mapred.TextOutputFormat Java Examples
The following examples show how to use org.apache.hadoop.mapred.TextOutputFormat. Each example notes its original project, source file, and license.
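Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: registering TextOutputFormat on a classic mapred JobConf, plus the optional separator and compression settings that several of the examples below adjust. The job name, the comma separator, and the use of IdentityMapper/IdentityReducer are illustrative choices for this sketch, not taken from any of the projects listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class TextOutputFormatSketch {

    public static void main(String[] args) throws Exception {
        // Plain pass-through job: TextInputFormat in, TextOutputFormat out.
        JobConf conf = new JobConf(TextOutputFormatSketch.class);
        conf.setJobName("text-output-format-sketch");

        // Identity mapper/reducer keep the TextInputFormat key/value types.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // Optional: change the key/value separator (the default is a tab).
        conf.set("mapred.textoutputformat.separator", ",");

        // Optional: gzip-compress the text output.
        TextOutputFormat.setCompressOutput(conf, true);
        TextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);

        JobClient.runJob(conf);
    }
}

The same calls appear throughout the examples: setOutputFormat(TextOutputFormat.class) selects the format, FileOutputFormat/TextOutputFormat.setOutputPath sets the destination, and the "mapred.textoutputformat.separator" property and the compression setters tune the emitted text.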
Example #1
Source File: PersonVersion.java From blog with MIT License

private static void runJobPv(String inputDir, String outputDir, String jobName,
        Class<? extends Mapper> mapClass, Class<? extends Reducer> reduceClass) throws Exception {
    JobConf conf = new JobConf(PersonVersion.class);
    conf.setJobName(jobName);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(mapClass);
    conf.setCombinerClass(reduceClass);
    conf.setReducerClass(reduceClass);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, new Path(outputDir));

    JobClient.runJob(conf);
}
Example #2
Source File: HdfsHelper.java From DataLink with Apache License 2.0

TextWriterProxy(Configuration config, String fileName) throws IOException {
    fieldDelimiter = config.getChar(Key.FIELD_DELIMITER);
    columns = config.getListConfiguration(Key.COLUMN);
    String compress = config.getString(Key.COMPRESS, null);

    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmm");
    String attempt = "attempt_" + dateFormat.format(new Date()) + "_0001_m_000000_0";
    Path outputPath = new Path(fileName);
    // TODO: the TASK_ATTEMPT_ID still needs to be confirmed
    conf.set(JobContext.TASK_ATTEMPT_ID, attempt);

    FileOutputFormat outFormat = new TextOutputFormat();
    outFormat.setOutputPath(conf, outputPath);
    outFormat.setWorkOutputPath(conf, outputPath);

    if (null != compress) {
        Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
        if (null != codecClass) {
            outFormat.setOutputCompressorClass(conf, codecClass);
        }
    }

    writer = outFormat.getRecordWriter(fileSystem, conf, outputPath.toString(), Reporter.NULL);
}
Example #3
Source File: SliveTest.java From hadoop with Apache License 2.0

/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, as well as the mapper and reducer
 * classes and the input and output key and value classes, along with any
 * other job configuration.
 *
 * @param config
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
    JobConf job = new JobConf(config.getConfig(), SliveTest.class);
    job.setInputFormat(DummyInputFormat.class);
    FileOutputFormat.setOutputPath(job, config.getOutputPath());
    job.setMapperClass(SliveMapper.class);
    job.setPartitionerClass(SlivePartitioner.class);
    job.setReducerClass(SliveReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setCompressOutput(job, false);
    job.setNumReduceTasks(config.getReducerAmount());
    job.setNumMapTasks(config.getMapAmount());
    return job;
}
Example #4
Source File: WordCountInput.java From aerospike-hadoop with Apache License 2.0

public int run(final String[] args) throws Exception {
    log.info("run starting");

    final Configuration conf = getConf();

    JobConf job = new JobConf(conf, WordCountInput.class);
    job.setJobName("AerospikeWordCountInput");

    job.setInputFormat(AerospikeInputFormat.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[0]));

    JobClient.runJob(job);

    log.info("finished");
    return 0;
}
Example #5
Source File: CartesianCommentComparison.java From hadoop-map-reduce-patterns with Apache License 2.0

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: CartesianCommentComparison <in> <out>");
        ToolRunner.printGenericCommandUsage(System.err);
        System.exit(2);
    }
    // Configure the join type
    JobConf conf = new JobConf("Cartesian Product");
    conf.setJarByClass(CartesianCommentComparison.class);
    conf.setMapperClass(CartesianMapper.class);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(CartesianInputFormat.class);
    // Configure the input format
    CartesianInputFormat.setLeftInputInfo(conf, TextInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(conf, TextInputFormat.class, args[0]);
    TextOutputFormat.setOutputPath(conf, new Path(args[1]));
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    RunningJob job = JobClient.runJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(1000);
    }
    return job.isSuccessful() ? 0 : 1;
}
Example #6
Source File: SliveTest.java From big-c with Apache License 2.0

/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, as well as the mapper and reducer
 * classes and the input and output key and value classes, along with any
 * other job configuration.
 *
 * @param config
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
    JobConf job = new JobConf(config.getConfig(), SliveTest.class);
    job.setInputFormat(DummyInputFormat.class);
    FileOutputFormat.setOutputPath(job, config.getOutputPath());
    job.setMapperClass(SliveMapper.class);
    job.setPartitionerClass(SlivePartitioner.class);
    job.setReducerClass(SliveReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setCompressOutput(job, false);
    job.setNumReduceTasks(config.getReducerAmount());
    job.setNumMapTasks(config.getMapAmount());
    return job;
}
Example #7
Source File: WordCount.java From attic-apex-malhar with Apache License 2.0

public void run(String[] args) throws Exception {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
Example #8
Source File: HadoopMapredCompatWordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
Example #9
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0

public void processDumpJob(String crawlDb, String output, Configuration config, String format,
        String regex, String status, Integer retry) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb dump: starting");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);

    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, outFolder);

    if (format.equals("csv")) {
        job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
    } else if (format.equals("crawldb")) {
        job.setOutputFormat(MapFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (status != null) job.set("status", status);
    if (regex != null) job.set("regex", regex);
    if (retry != null) job.setInt("retry", retry);

    job.setMapperClass(CrawlDbDumpMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb dump: done");
    }
}
Example #10
Source File: MultipleTextOutputFormat.java From RDFS with Apache License 2.0

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable arg3) throws IOException {
    if (theTextOutputFormat == null) {
        theTextOutputFormat = new TextOutputFormat<K, V>();
    }
    return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
Example #11
Source File: TableMapReduceUtil.java From hbase with Apache License 2.0

/**
 * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
 */
public static void addDependencyJars(JobConf job) throws IOException {
    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
    org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJarsForClasses(
        job,
        job.getMapOutputKeyClass(),
        job.getMapOutputValueClass(),
        job.getOutputKeyClass(),
        job.getOutputValueClass(),
        job.getPartitionerClass(),
        job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
        job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
        job.getCombinerClass());
}
Example #12
Source File: HadoopMapredCompatWordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
Example #13
Source File: HadoopWordCount1.java From ignite with Apache License 2.0

/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner, boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(HadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(HadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(HadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
Example #14
Source File: CompositeUserJoin.java From hadoop-map-reduce-patterns with Apache License 2.0

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }
    Path userPath = new Path(args[0]);
    Path commentPath = new Path(args[1]);
    Path outputDir = new Path(args[2]);
    String joinType = args[3];
    JobConf conf = new JobConf("CompositeJoin");
    conf.setJarByClass(CompositeUserJoin.class);
    conf.setMapperClass(CompositeMapper.class);
    conf.setNumReduceTasks(0);
    // Set the input format class to a CompositeInputFormat class.
    // The CompositeInputFormat will parse all of our input files and output
    // records to our mapper.
    conf.setInputFormat(CompositeInputFormat.class);
    // The composite input format join expression will set how the records
    // are going to be read in, and in what input format.
    conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType,
            KeyValueTextInputFormat.class, userPath, commentPath));
    TextOutputFormat.setOutputPath(conf, outputDir);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    RunningJob job = JobClient.runJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(1000);
    }
    return job.isSuccessful() ? 0 : 1;
}
Example #15
Source File: CrawlDbReader.java From anthelion with Apache License 2.0

public void processDumpJob(String crawlDb, String output, Configuration config, String format,
        String regex, String status) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb dump: starting");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);

    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, outFolder);

    if (format.equals("csv")) {
        job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
    } else if (format.equals("crawldb")) {
        job.setOutputFormat(MapFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (status != null) job.set("status", status);
    if (regex != null) job.set("regex", regex);

    job.setMapperClass(CrawlDbDumpMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb dump: done");
    }
}
Example #16
Source File: LogCountsPerHour.java From attic-apex-malhar with Apache License 2.0

public int run(String[] args) throws Exception {
    // Create a configuration
    Configuration conf = getConf();

    // Create a job from the default configuration that will use the LogCountsPerHour class
    JobConf job = new JobConf(conf, LogCountsPerHour.class);

    // Define our input path as the first command line argument and our output path as the second
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    // Create File Input/Output formats for these paths (in the job)
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // Configure the job: name, mapper, reducer, and combiner
    job.setJobName("LogAveragePerHour");
    job.setMapperClass(LogMapClass.class);
    job.setReducerClass(LogReduce.class);
    job.setCombinerClass(LogReduce.class);

    // Configure the output
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(DateWritable.class);
    job.setOutputValueClass(IntWritable.class);

    // Run the job
    JobClient.runJob(job);
    return 0;
}
Example #17
Source File: WordCountWithHadoopOutputFormat.java From stratosphere with Apache License 2.0

@Override
public Plan getPlan(String... args) {
    // parse job parameters
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
            new TextInputFormat(), new JobConf(), "Input Lines");
    TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine())
            .input(source)
            .name("Tokenize Lines")
            .build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
            .input(mapper)
            .name("Count Words")
            .build();

    HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat",
            reducer, Text.class, IntWritable.class);
    TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

    Plan plan = new Plan(out, "Hadoop OutputFormat Example");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}
Example #18
Source File: MultipleTextOutputFormat.java From big-c with Apache License 2.0

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable arg3) throws IOException {
    if (theTextOutputFormat == null) {
        theTextOutputFormat = new TextOutputFormat<K, V>();
    }
    return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
Example #19
Source File: CopyFromS3.java From emr-sample-apps with Apache License 2.0

/**
 * This method constructs the JobConf to be used to run the map reduce job to
 * download the files from S3. This is a potentially expensive method since it
 * makes multiple calls to S3 to get a listing of all the input data. Clients
 * are encouraged to cache the returned JobConf reference and not call this
 * method multiple times unless necessary.
 *
 * @return the JobConf to be used to run the map reduce job to download the
 *         files from S3.
 */
public JobConf getJobConf() throws IOException, ParseException {
    JobConf conf = new JobConf(CopyFromS3.class);
    conf.setJobName("CopyFromS3");
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(S3CopyMapper.class);
    // We configure a reducer, even though we don't use it right now.
    // The idea is that, in the future we may.
    conf.setReducerClass(HDFSWriterReducer.class);
    conf.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(conf, new Path(tempFile));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setCompressMapOutput(true);

    JobClient jobClient = new JobClient(conf);

    FileSystem inputFS = FileSystem.get(URI.create(inputPathPrefix), conf);
    DatePathFilter datePathFilter = new DatePathFilter(startDate, endDate);
    List<Path> filePaths = getFilePaths(inputFS, new Path(inputPathPrefix), datePathFilter,
            jobClient.getDefaultMaps());

    // Write the file names to a temporary index file to be used
    // as input to the map tasks.
    FileSystem outputFS = FileSystem.get(URI.create(tempFile), conf);
    FSDataOutputStream outputStream = outputFS.create(new Path(tempFile), true);
    try {
        for (Path path : filePaths) {
            outputStream.writeBytes(path.toString() + "\n");
        }
    } finally {
        outputStream.close();
    }

    conf.setNumMapTasks(Math.min(filePaths.size(), jobClient.getDefaultMaps()));

    return conf;
}
Example #20
Source File: MultipleTextOutputFormat.java From hadoop-gpu with Apache License 2.0

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable arg3) throws IOException {
    if (theTextOutputFormat == null) {
        theTextOutputFormat = new TextOutputFormat<K, V>();
    }
    return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
Example #21
Source File: MultipleTextOutputFormat.java From hadoop with Apache License 2.0

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable arg3) throws IOException {
    if (theTextOutputFormat == null) {
        theTextOutputFormat = new TextOutputFormat<K, V>();
    }
    return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
Example #22
Source File: HadoopMapredCompatWordCount.java From Flink-CEPplus with Apache License 2.0

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
Example #23
Source File: TestUtils.java From circus-train with Apache License 2.0

public static Table createPartitionedTable(
        HiveMetaStoreClient metaStoreClient,
        String database,
        String table,
        URI location) throws Exception {
    return createPartitionedTable(metaStoreClient, database, table, location, DATA_COLUMNS, PARTITION_COLUMNS,
            "org.apache.hadoop.hive.serde2.OpenCSVSerde", TextInputFormat.class.getName(),
            TextOutputFormat.class.getName());
}
Example #24
Source File: TestUtils.java From circus-train with Apache License 2.0

public static Table createUnpartitionedTable(
        HiveMetaStoreClient metaStoreClient,
        String database,
        String table,
        URI location) throws TException {
    Table hiveTable = new Table();
    hiveTable.setDbName(database);
    hiveTable.setTableName(table);
    hiveTable.setTableType(TableType.EXTERNAL_TABLE.name());
    hiveTable.putToParameters("EXTERNAL", "TRUE");

    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(DATA_COLUMNS);
    sd.setLocation(location.toString());
    sd.setParameters(new HashMap<String, String>());
    sd.setInputFormat(TextInputFormat.class.getName());
    sd.setOutputFormat(TextOutputFormat.class.getName());
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setSerializationLib("org.apache.hadoop.hive.serde2.OpenCSVSerde");

    hiveTable.setSd(sd);

    metaStoreClient.createTable(hiveTable);

    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, database, table);
    ColumnStatisticsData statsData = new ColumnStatisticsData(_Fields.LONG_STATS, new LongColumnStatsData(1L, 2L));
    ColumnStatisticsObj cso1 = new ColumnStatisticsObj("id", "bigint", statsData);
    List<ColumnStatisticsObj> statsObj = Collections.singletonList(cso1);
    metaStoreClient.updateTableColumnStatistics(new ColumnStatistics(statsDesc, statsObj));

    return hiveTable;
}
Example #25
Source File: CountKmers.java From emr-sample-apps with Apache License 2.0

/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String inpath = null;
    String outpath = null;
    int kmerlen = 0;
    int numMappers = 1;
    int numReducers = 1;
    int showpos = 0;

    int data = 1;

    if (data == 0) {
        if (args.length != 6) {
            System.err.println("Usage: CountKmers filename outpath kmerlen showpos numMappers numReducers");
            return;
        }

        inpath = args[0];
        outpath = args[1];
        kmerlen = Integer.parseInt(args[2]);
        showpos = Integer.parseInt(args[3]);
        numMappers = Integer.parseInt(args[4]);
        numReducers = Integer.parseInt(args[5]);
    } else if (data == 1) {
        inpath = "/user/guest/cloudburst/s_suis.br";
        outpath = "/user/mschatz/kmers";
        kmerlen = 12;
        showpos = 0;
        numMappers = 1;
        numReducers = 1;
    }

    System.out.println("inpath: " + inpath);
    System.out.println("outpath: " + outpath);
    System.out.println("kmerlen: " + kmerlen);
    System.out.println("showpos: " + showpos);
    System.out.println("nummappers: " + numMappers);
    System.out.println("numreducers: " + numReducers);

    JobConf conf = new JobConf(MerReduce.class);
    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);

    conf.addInputPath(new Path(inpath));

    conf.set("KMER_LEN", Integer.toString(kmerlen));
    conf.set("SHOW_POS", Integer.toString(showpos));

    conf.setInputFormat(SequenceFileInputFormat.class);

    conf.setMapOutputKeyClass(BytesWritable.class);
    conf.setMapOutputValueClass(IntWritable.class);
    //conf.setCompressMapOutput(true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MerMapClass.class);
    conf.setReducerClass(MerReduceClass.class);

    Path oPath = new Path(outpath);
    conf.setOutputPath(oPath);
    System.err.println("  Removing old results");
    FileSystem.get(conf).delete(oPath);

    conf.setJobName("CountMers");

    Timer t = new Timer();
    RunningJob rj = JobClient.runJob(conf);
    System.err.println("CountMers Finished");

    System.err.println("Total Running time was " + t.get());

    Counters counters = rj.getCounters();
    Counters.Group task = counters.getGroup("org.apache.hadoop.mapred.Task$Counter");
    long numDistinctMers = task.getCounter("REDUCE_INPUT_GROUPS");
    System.err.println("Num Distinct Mers: " + numDistinctMers);
}
Example #26
Source File: UnionExample.java From tez with Apache License 2.0

private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources,
        Path stagingDir, String inputPath, String outputPath) throws IOException {
    DAG dag = DAG.create("UnionExample");

    int numMaps = -1;
    Configuration inputConf = new Configuration(tezConf);
    inputConf.setBoolean("mapred.mapper.new-api", false);
    inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
    inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
    MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
    DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

    Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(
        TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

    Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(
        UnionProcessor.class.getName()), 1);

    Configuration outputConf = new Configuration(tezConf);
    outputConf.setBoolean("mapred.reducer.new-api", false);
    outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
    outputConf.set(FileOutputFormat.OUTDIR, outputPath);
    DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
    checkerVertex.addDataSink("union", od);

    Configuration allPartsConf = new Configuration(tezConf);
    DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
        TextOutputFormat.class, outputPath + "-all-parts").build();
    checkerVertex.addDataSink("all-parts", od2);

    Configuration partsConf = new Configuration(tezConf);
    DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
        TextOutputFormat.class, outputPath + "-parts").build();
    VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
    unionVertex.addDataSink("parts", od1);

    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName()).build();

    dag.addVertex(mapVertex1)
        .addVertex(mapVertex2)
        .addVertex(mapVertex3)
        .addVertex(checkerVertex)
        .addEdge(
            Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
        .addEdge(
            GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
                InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
    return dag;
}
Example #27
Source File: LinkRank.java From nutch-htmlunit with Apache License 2.0

/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}
Example #28
Source File: NodeDumper.java From nutch-htmlunit with Apache License 2.0

/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff,
        NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
}
Example #29
Source File: TestKeyFieldBasedComparator.java From hadoop-gpu with Apache License 2.0

public void configure(String keySpec, int expect) throws Exception {
    Path testdir = new Path("build/test/test.mapred.spill");
    Path inDir = new Path(testdir, "in");
    Path outDir = new Path(testdir, "out");
    FileSystem fs = getFileSystem();
    fs.delete(testdir, true);
    conf.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(2);

    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
    conf.setKeyFieldComparatorOptions(keySpec);
    conf.setKeyFieldPartitionerOptions("-k1.1,1.1");
    conf.set("map.output.key.field.separator", " ");
    conf.setMapperClass(InverseMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    if (!fs.mkdirs(testdir)) {
        throw new IOException("Mkdirs failed to create " + testdir.toString());
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Mkdirs failed to create " + inDir.toString());
    }
    // set up input data in 2 files
    Path inFile = new Path(inDir, "part0");
    FileOutputStream fos = new FileOutputStream(inFile.toString());
    fos.write((line1 + "\n").getBytes());
    fos.write((line2 + "\n").getBytes());
    fos.close();
    JobClient jc = new JobClient(conf);
    RunningJob r_job = jc.submitJob(conf);
    while (!r_job.isComplete()) {
        Thread.sleep(1000);
    }

    if (!r_job.isSuccessful()) {
        fail("Oops! The job broke due to an unexpected error");
    }
    Path[] outputFiles = FileUtil.stat2Paths(
        getFileSystem().listStatus(outDir, new OutputLogFilter()));
    if (outputFiles.length > 0) {
        InputStream is = getFileSystem().open(outputFiles[0]);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
        String line = reader.readLine();
        // make sure we get what we expect as the first line, and also
        // that we have two lines (both the lines must end up in the same
        // reducer since the partitioner takes the same key spec for all
        // lines)
        if (expect == 1) {
            assertTrue(line.startsWith(line1));
        } else if (expect == 2) {
            assertTrue(line.startsWith(line2));
        }
        line = reader.readLine();
        if (expect == 1) {
            assertTrue(line.startsWith(line2));
        } else if (expect == 2) {
            assertTrue(line.startsWith(line1));
        }
        reader.close();
    }
}
Example #30
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0

public void processTopNJob(String crawlDb, long topN, float min, String output,
        Configuration config) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: done");
    }
}