org.apache.hadoop.mapreduce.lib.input.TextInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.input.TextInputFormat.
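Most of the examples below follow the same pattern: create a Job, register TextInputFormat with job.setInputFormatClass(...), and point FileInputFormat at the input, so the mapper receives the byte offset (LongWritable) as key and the line (Text) as value. Here is a minimal, self-contained sketch of that pattern; the class name TextInputFormatSketch and the identity-mapper setup are illustrative, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TextInputFormatSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "text-input-format-sketch");
    job.setJarByClass(TextInputFormatSketch.class);

    // TextInputFormat splits plain-text files into lines and hands each
    // mapper the byte offset (LongWritable) as key and the line (Text) as value.
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));

    // Identity mapper and no reducers: the job simply rewrites
    // (offset, line) pairs, which is enough to exercise the input format.
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}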
Example #1
Source File: TestMapReduceLazyOutput.java, from hadoop (Apache License 2.0)

private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
Example #2
Source File: MRWordCount21.java, from hadoop-book (Apache License 2.0)

@Override
public int run(String[] args) throws Exception {
  System.out.println("Running MR: MRWordCount21");
  Job job = new Job(getConf());
  job.setJarByClass(MRWordCount21.class);
  job.setJobName("MRWordCount21");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(Map21.class);
  job.setCombinerClass(Reduce21.class);
  job.setReducerClass(Reduce21.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  System.out.println("Input path: " + args[0]);
  System.out.println("Output path: " + args[1]);

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
Example #3
Source File: BigDiffHadoop.java, from secure-data-service (Apache License 2.0)

public void execute(String inputPath1, String inputPath2, String outputPath) throws Exception {
  Configuration conf = new Configuration();
  Job job = new Job(conf, "bigdiff");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(inputPath1));
  FileInputFormat.addInputPath(job, new Path(inputPath2));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #4
Source File: P2Q2.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P2Q2");
  job.setJarByClass(P2Q2.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(P2Q2Map.class);
  job.setCombinerClass(P2Q2Reduce.class);
  job.setReducerClass(P2Q2Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #5
Source File: PageRankDriver.java, from flink-perf (Apache License 2.0)

public static void assignInitialRanks(Configuration conf, FileSystem fs,
    String adjacencyPath, String initialPath, int numVertices) throws Exception {
  Path seqFile = new Path(initialPath);
  if (fs.exists(seqFile)) {
    fs.delete(seqFile, true);
  }

  Job job = Job.getInstance(conf);
  job.setJarByClass(InitialRankAssigner.class);
  job.setMapperClass(InitialRankAssigner.class);
  job.setReducerClass(Reducer.class);
  job.setNumReduceTasks(0);

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Message.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Message.class);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, new Path(adjacencyPath));
  FileOutputFormat.setOutputPath(job, seqFile);
  job.waitForCompletion(true);
}
Example #6
Source File: P2Q3.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P2Q3");
  job.setJarByClass(P2Q3.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(P2Q3Map.class);
  job.setCombinerClass(P2Q3Reduce.class);
  job.setReducerClass(P2Q3Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #7
Source File: P1Q2.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P1Q2");
  job.setJarByClass(P1Q2.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.setMapperClass(P1Q2Map.class);
  job.setCombinerClass(P1Q2Reduce.class);
  job.setReducerClass(P1Q2Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #8
Source File: UserNamePermission.java, from hadoop (Apache License 2.0)

public static void main(String[] args) throws Exception {
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #9
Source File: FieldSelectionMapper.java, from hadoop (Apache License 2.0)

public void setup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  this.fieldSeparator =
      conf.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t");
  this.mapOutputKeyValueSpec =
      conf.get(FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
  try {
    this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
        context.getInputFormatClass().getCanonicalName());
  } catch (ClassNotFoundException e) {
    throw new IOException("Input format class not found", e);
  }
  allMapValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
      mapOutputKeyValueSpec, mapOutputKeyFieldList, mapOutputValueFieldList);
  LOG.info(FieldSelectionHelper.specToString(fieldSeparator,
      mapOutputKeyValueSpec, allMapValueFieldsFrom, mapOutputKeyFieldList,
      mapOutputValueFieldList) + "\nignoreInputKey:" + ignoreInputKey);
}
Example #10
Source File: TestSpecificInputOutputFormat.java, from parquet-mr (Apache License 2.0)

@Before
public void createParquetFile() throws Exception {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TestSpecificInputOutputFormat.MyMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setOutputPath(job, parquetPath);
    AvroParquetOutputFormat.setSchema(job, Car.SCHEMA$);

    waitForJob(job);
  }
}
Example #11
Source File: TestInputOutputFormat.java, from parquet-mr (Apache License 2.0)

private void write(final Configuration conf, final Path inputPath,
    final Path parquetPath, Class<? extends Mapper> mapperClass,
    Class<? extends TBase<?, ?>> outputClass) throws IOException, Exception {
  final Job job = new Job(conf, "write");

  // input not really used
  TextInputFormat.addInputPath(job, inputPath);
  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(ParquetThriftOutputFormat.class);
  ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
  ParquetThriftOutputFormat.setOutputPath(job, parquetPath);
  ParquetThriftOutputFormat.setThriftClass(job, outputClass);

  waitForJob(job);
}
Example #12
Source File: TestFileSystemInput.java, from envelope (Apache License 2.0)

@Test
public void readInputFormat() throws Exception {
  Map<String, Object> paramMap = new HashMap<>();
  paramMap.put(FileSystemInput.FORMAT_CONFIG, "input-format");
  paramMap.put(FileSystemInput.PATH_CONFIG,
      FileSystemInput.class.getResource(CSV_DATA).getPath());
  paramMap.put(FileSystemInput.INPUT_FORMAT_TYPE_CONFIG,
      TextInputFormat.class.getCanonicalName());
  paramMap.put("translator" + "." + ComponentFactory.TYPE_CONFIG_NAME,
      DummyInputFormatTranslator.class.getCanonicalName());
  config = ConfigFactory.parseMap(paramMap);

  FileSystemInput formatInput = new FileSystemInput();
  assertNoValidationFailures(formatInput, config);
  formatInput.configure(config);

  Dataset<Row> results = formatInput.read();

  assertEquals("Invalid number of rows", 4, results.count());
  assertEquals("Invalid first row result", 0L, results.first().getLong(0));
  assertEquals("Invalid first row result", "One,Two,Three,Four",
      results.first().getString(1));
}
Example #13
Source File: P2Q1.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P2Q1");
  job.setJarByClass(P2Q1.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(P2Q1Map.class);
  job.setCombinerClass(P2Q1Reduce.class);
  job.setReducerClass(P2Q1Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #14
Source File: FailJob.java, from big-c (Apache License 2.0)

public Job createJob(boolean failMappers, boolean failReducers, Path inputFile)
    throws IOException {
  Configuration conf = getConf();
  conf.setBoolean(FAIL_MAP, failMappers);
  conf.setBoolean(FAIL_REDUCE, failReducers);

  Job job = Job.getInstance(conf, "fail");
  job.setJarByClass(FailJob.class);
  job.setMapperClass(FailMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(FailReducer.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Fail job");
  FileInputFormat.addInputPath(job, inputFile);
  return job;
}
Example #15
Source File: TestMapReduceLazyOutput.java, from big-c (Apache License 2.0)

private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
Example #16
Source File: P1Q3.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P1Q3");
  job.setJarByClass(P1Q3.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.setMapperClass(P1Q3Map.class);
  //job.setCombinerClass(P1Q3Reduce.class);
  job.setReducerClass(P1Q3Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #17
Source File: ReplicatedUserJoin.java, from hadoop-map-reduce-patterns (Apache License 2.0)

@Override
public int run(String[] args) throws Exception {
  Configuration conf = new Configuration();
  GenericOptionsParser parser = new GenericOptionsParser(conf, args);
  String[] otherArgs = parser.getRemainingArgs();
  if (otherArgs.length != 4) {
    printUsage();
  }
  Job job = new Job(conf, "ReduceSideJoin");
  job.setJarByClass(ReplicatedUserJoin.class);

  // Use MultipleInputs to set which input uses what mapper.
  // This will keep parsing of each data set separate from a logical standpoint.
  // The first two elements of the args array are the two inputs.
  MultipleInputs.addInputPath(job, new Path(args[0]),
      TextInputFormat.class, UserJoinMapper.class);
  MultipleInputs.addInputPath(job, new Path(args[1]),
      TextInputFormat.class, CommentJoinMapper.class);
  job.getConfiguration().set("join.type", args[2]);

  job.setReducerClass(UserJoinReducer.class);

  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, new Path(args[3]));

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  return job.waitForCompletion(true) ? 0 : 2;
}
Example #18
Source File: SamplerJob.java, from hiped2 (Apache License 2.0)

/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();
  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(SamplerJob.class);

  ReservoirSamplerInputFormat.setInputFormat(job, TextInputFormat.class);
  ReservoirSamplerInputFormat.setNumSamples(job, 10);
  ReservoirSamplerInputFormat.setMaxRecordsToRead(job, 10000);
  ReservoirSamplerInputFormat.setUseSamplesNumberPerInputSplit(job, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
Example #19
Source File: ClusterHdfsSource.java, from datacollector (Apache License 2.0)

@VisibleForTesting
List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  int previewCount = previewBuffer.size();
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  List<Map.Entry> batch = new ArrayList<>();
  Path filePath = fileStatus.getPath();
  // MR allows file length to be 0 for text data (not for avro)
  if (fileLength == 0) {
    LOG.info("File length is 0 for {}", filePath);
    return batch;
  }
  // Hadoop does unsafe casting from long to int, so split length should
  // not be greater than int max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(filePath, 0, splitLength, null);
  TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
      TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));

  try (RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext)) {
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();

    while (hasNext && batch.size() < batchSize && previewCount < batchSize) {
      batch.add(new Pair(filePath.toUri().getPath() + "::" + recordReader.getCurrentKey(),
          String.valueOf(recordReader.getCurrentValue())));
      hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
      previewCount++;
    }
  }
  return batch;
}
Example #20
Source File: JMatrixMultiplicationStep1.java, from RecommendationEngine (MIT License)

public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String inputPath1 = ItemBasedCFDriver.path.get("step7InputPath1");
  String inputPath2 = ItemBasedCFDriver.path.get("step7InputPath2");
  String outputPath = ItemBasedCFDriver.path.get("step7OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ",");

  Job job = Job.getInstance(conf);

  HDFS hdfs = new HDFS(conf);
  hdfs.rmr(outputPath);

  job.setMapperClass(Step1_Mapper.class);
  job.setJarByClass(JMatrixMultiplicationStep1.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath1), new Path(inputPath2));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #21
Source File: CalculateSimilarityStep2.java, from RecommendationEngine (MIT License)

public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String inputPath = ItemBasedCFDriver.path.get("step2InputPath");
  String outputPath = ItemBasedCFDriver.path.get("step2OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ":");

  Job job = Job.getInstance(conf);

  HDFS hdfs = new HDFS(conf);
  hdfs.rmr(outputPath);

  job.setMapperClass(Step2_Mapper.class);
  job.setReducerClass(Step2_Reducer.class);
  job.setCombinerClass(Step2_Reducer.class);
  job.setNumReduceTasks(ItemBasedCFDriver.ReducerNumber);
  job.setJarByClass(CalculateSimilarityStep2.class);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #22
Source File: LouvainRunner.java, from distributed-graph-analytics (Apache License 2.0)

private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
  Configuration mrConf = new Configuration();
  for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
    mrConf.set(entry.getKey(), entry.getValue());
  }

  Job job = Job.getInstance(mrConf); // use the configuration populated above
  job.setJarByClass(LouvainRunner.class);
  Path in = new Path(inputPath);
  Path out = new Path(outputPath);

  FileInputFormat.setInputPaths(job, in);
  FileOutputFormat.setOutputPath(job, out);
  job.setJobName("CommunityCompression");

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LouvainVertexWritable.class);

  job.setMapperClass(CommunityCompression.Map.class);
  job.setReducerClass(CommunityCompression.Reduce.class);

  logger.debug("Running Mapreduce step with job configuration: {}", job);

  return job.waitForCompletion(false) ? 0 : 1;
}
Example #23
Source File: JobContextImpl.java, from tez (Apache License 2.0)

/**
 * Get the {@link InputFormat} class for the job.
 *
 * @return the {@link InputFormat} class for the job.
 */
@SuppressWarnings("unchecked")
public Class<? extends InputFormat<?, ?>> getInputFormatClass()
    throws ClassNotFoundException {
  return (Class<? extends InputFormat<?, ?>>)
      conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
}
Example #24
Source File: CommunityCompression.java, from distributed-graph-analytics (Apache License 2.0)

public int run(String[] args) throws Exception {
  Configuration mrConf = this.getConf();
  for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
    mrConf.set(entry.getKey(), entry.getValue());
  }

  Job job = Job.getInstance(mrConf);
  job.setJarByClass(CommunityCompression.class);
  Path in = new Path(inputPath);
  Path out = new Path(outputPath);

  FileInputFormat.setInputPaths(job, in);
  FileOutputFormat.setOutputPath(job, out);
  job.setJobName("CommunityCompression");

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LouvainVertexWritable.class);

  job.setMapperClass(CommunityCompression.Map.class);
  job.setReducerClass(CommunityCompression.Reduce.class);

  logger.debug("Running Mapreduce step with job configuration: {}", job);

  return job.waitForCompletion(false) ? 0 : 1;
}
Example #25
Source File: WordCount.java, from flink (Apache License 2.0)

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example #26
Source File: TestMRHelpers.java, from incubator-tez (Apache License 2.0)

private InputSplitInfo generateNewSplits(Path inputSplitsDir) throws Exception {
  JobConf jobConf = new JobConf();
  jobConf.setUseNewMapper(true);
  jobConf.setClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
      TextInputFormat.class, InputFormat.class);
  jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());
  return MRHelpers.generateInputSplits(jobConf, inputSplitsDir);
}
Example #27
Source File: WordCount.java, from accumulo-examples (Apache License 2.0)

public static void main(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(WordCount.class.getName(), args);

  // Create Accumulo table and attach Summing iterator
  try (AccumuloClient client = opts.createAccumuloClient()) {
    client.tableOperations().create(opts.tableName);
    IteratorSetting is = new IteratorSetting(10, SummingCombiner.class);
    SummingCombiner.setColumns(is,
        Collections.singletonList(new IteratorSetting.Column("count")));
    SummingCombiner.setEncodingType(is, SummingCombiner.Type.STRING);
    client.tableOperations().attachIterator(opts.tableName, is);
  } catch (TableExistsException e) {
    // ignore
  }

  // Create M/R job
  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(WordCount.class.getName());
  job.setJarByClass(WordCount.class);
  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, new Path(opts.inputDirectory));

  job.setMapperClass(MapClass.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

  if (opts.hdfsPath != null) {
    AccumuloOutputFormat.configure().clientPropertiesPath(opts.hdfsPath)
        .defaultTable(opts.tableName).store(job);
  } else {
    AccumuloOutputFormat.configure().clientProperties(opts.getClientProperties())
        .defaultTable(opts.tableName).store(job);
  }
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #28
Source File: Phase3Step2DistinctDataJob.java, from dkpro-c4corpus (Apache License 2.0)

@Override
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());

  job.setJarByClass(Phase3Step2DistinctDataJob.class);
  job.setJobName(Phase3Step2DistinctDataJob.class.getName());

  // mapper
  job.setMapperClass(RemoveRedundantDataMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NullWritable.class);

  // reducer
  job.setReducerClass(RemoveRedundantDataReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);

  // paths
  String commaSeparatedInputFiles = args[0];
  String outputPath = args[1];

  job.setInputFormatClass(TextInputFormat.class);
  LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

  // i/o paths
  FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  return job.waitForCompletion(true) ? 0 : 1;
}
Example #29
Source File: CalculateSimilarityStep5.java, from RecommendationEngine (MIT License)

public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String inputPath = ItemBasedCFDriver.path.get("step5InputPath");
  String outputPath = ItemBasedCFDriver.path.get("step5OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ":");

  Job job = Job.getInstance(conf);

  HDFS hdfs = new HDFS(conf);
  hdfs.rmr(outputPath);

  job.setMapperClass(Step5_Mapper.class);
  job.setJarByClass(CalculateSimilarityStep5.class);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IntWritable.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #30
Source File: TestMRInputHelpers.java, from tez (Apache License 2.0)

private DataSourceDescriptor generateDataSourceDescriptorMapReduce(Path inputSplitsDir)
    throws Exception {
  JobConf jobConf = new JobConf(dfsCluster.getFileSystem().getConf());
  jobConf.setUseNewMapper(true);
  jobConf.setClass(org.apache.hadoop.mapreduce.MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
      TextInputFormat.class, InputFormat.class);
  jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());

  return MRInputHelpers.configureMRInputWithLegacySplitGeneration(jobConf,
      inputSplitsDir, true);
}