org.apache.hadoop.mapreduce.lib.input.TextInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.input.TextInputFormat.
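Most of the examples below follow the same pattern: create a Job, register TextInputFormat with job.setInputFormatClass(...), and point FileInputFormat at the input, so the mapper receives the byte offset (LongWritable) as key and the line (Text) as value. Here is a minimal, self-contained sketch of that pattern; the class name TextInputFormatSketch and the identity-mapper setup are illustrative, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TextInputFormatSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "text-input-format-sketch");
    job.setJarByClass(TextInputFormatSketch.class);

    // TextInputFormat splits plain-text files into lines and hands each
    // mapper the byte offset (LongWritable) as key and the line (Text) as value.
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));

    // Identity mapper and no reducers: the job simply rewrites
    // (offset, line) pairs, which is enough to exercise the input format.
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}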
Example #1
Source File: TestMapReduceLazyOutput.java, from hadoop (Apache License 2.0)

private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
Example #2
Source File: MRWordCount21.java, from hadoop-book (Apache License 2.0)

@Override
public int run(String[] args) throws Exception {
  System.out.println("Running MR: MRWordCount21");
  Job job = new Job(getConf());
  job.setJarByClass(MRWordCount21.class);
  job.setJobName("MRWordCount21");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(Map21.class);
  job.setCombinerClass(Reduce21.class);
  job.setReducerClass(Reduce21.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  System.out.println("Input path: " + args[0]);
  System.out.println("Output path: " + args[1]);

  FileInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
Example #3
Source File: BigDiffHadoop.java, from secure-data-service (Apache License 2.0)

public void execute(String inputPath1, String inputPath2, String outputPath) throws Exception {
  Configuration conf = new Configuration();
  Job job = new Job(conf, "bigdiff");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(inputPath1));
  FileInputFormat.addInputPath(job, new Path(inputPath2));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #4
Source File: P2Q2.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P2Q2");
  job.setJarByClass(P2Q2.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(P2Q2Map.class);
  job.setCombinerClass(P2Q2Reduce.class);
  job.setReducerClass(P2Q2Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #5
Source File: PageRankDriver.java, from flink-perf (Apache License 2.0)

public static void assignInitialRanks(Configuration conf, FileSystem fs,
    String adjacencyPath, String initialPath, int numVertices) throws Exception {
  Path seqFile = new Path(initialPath);
  if (fs.exists(seqFile)) {
    fs.delete(seqFile, true);
  }

  Job job = Job.getInstance(conf);
  job.setJarByClass(InitialRankAssigner.class);
  job.setMapperClass(InitialRankAssigner.class);
  job.setReducerClass(Reducer.class);
  job.setNumReduceTasks(0);

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Message.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Message.class);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, new Path(adjacencyPath));
  FileOutputFormat.setOutputPath(job, seqFile);
  job.waitForCompletion(true);
}
Example #6
Source File: P2Q3.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P2Q3");
  job.setJarByClass(P2Q3.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(P2Q3Map.class);
  job.setCombinerClass(P2Q3Reduce.class);
  job.setReducerClass(P2Q3Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #7
Source File: P1Q2.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P1Q2");
  job.setJarByClass(P1Q2.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.setMapperClass(P1Q2Map.class);
  job.setCombinerClass(P1Q2Reduce.class);
  job.setReducerClass(P1Q2Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #8
Source File: UserNamePermission.java, from hadoop (Apache License 2.0)

public static void main(String[] args) throws Exception {
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #9
Source File: FieldSelectionMapper.java, from hadoop (Apache License 2.0)

public void setup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  this.fieldSeparator =
      conf.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t");
  this.mapOutputKeyValueSpec =
      conf.get(FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
  try {
    this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
        context.getInputFormatClass().getCanonicalName());
  } catch (ClassNotFoundException e) {
    throw new IOException("Input format class not found", e);
  }
  allMapValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
      mapOutputKeyValueSpec, mapOutputKeyFieldList, mapOutputValueFieldList);
  LOG.info(FieldSelectionHelper.specToString(fieldSeparator,
      mapOutputKeyValueSpec, allMapValueFieldsFrom, mapOutputKeyFieldList,
      mapOutputValueFieldList) + "\nignoreInputKey:" + ignoreInputKey);
}
Example #10
Source File: TestSpecificInputOutputFormat.java, from parquet-mr (Apache License 2.0)

@Before
public void createParquetFile() throws Exception {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TestSpecificInputOutputFormat.MyMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setOutputPath(job, parquetPath);
    AvroParquetOutputFormat.setSchema(job, Car.SCHEMA$);

    waitForJob(job);
  }
}
Example #11
Source File: TestInputOutputFormat.java, from parquet-mr (Apache License 2.0)

private void write(final Configuration conf, final Path inputPath,
    final Path parquetPath, Class<? extends Mapper> mapperClass,
    Class<? extends TBase<?, ?>> outputClass) throws IOException, Exception {
  final Job job = new Job(conf, "write");

  // input not really used
  TextInputFormat.addInputPath(job, inputPath);
  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(ParquetThriftOutputFormat.class);
  ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
  ParquetThriftOutputFormat.setOutputPath(job, parquetPath);
  ParquetThriftOutputFormat.setThriftClass(job, outputClass);

  waitForJob(job);
}
Example #12
Source File: TestFileSystemInput.java, from envelope (Apache License 2.0)

@Test
public void readInputFormat() throws Exception {
  Map<String, Object> paramMap = new HashMap<>();
  paramMap.put(FileSystemInput.FORMAT_CONFIG, "input-format");
  paramMap.put(FileSystemInput.PATH_CONFIG,
      FileSystemInput.class.getResource(CSV_DATA).getPath());
  paramMap.put(FileSystemInput.INPUT_FORMAT_TYPE_CONFIG,
      TextInputFormat.class.getCanonicalName());
  paramMap.put("translator" + "." + ComponentFactory.TYPE_CONFIG_NAME,
      DummyInputFormatTranslator.class.getCanonicalName());
  config = ConfigFactory.parseMap(paramMap);

  FileSystemInput formatInput = new FileSystemInput();
  assertNoValidationFailures(formatInput, config);
  formatInput.configure(config);

  Dataset<Row> results = formatInput.read();

  assertEquals("Invalid number of rows", 4, results.count());
  assertEquals("Invalid first row result", 0L, results.first().getLong(0));
  assertEquals("Invalid first row result", "One,Two,Three,Four",
      results.first().getString(1));
}
Example #13
Source File: P2Q1.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P2Q1");
  job.setJarByClass(P2Q1.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setMapperClass(P2Q1Map.class);
  job.setCombinerClass(P2Q1Reduce.class);
  job.setReducerClass(P2Q1Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #14
Source File: FailJob.java, from big-c (Apache License 2.0)

public Job createJob(boolean failMappers, boolean failReducers, Path inputFile)
    throws IOException {
  Configuration conf = getConf();
  conf.setBoolean(FAIL_MAP, failMappers);
  conf.setBoolean(FAIL_REDUCE, failReducers);

  Job job = Job.getInstance(conf, "fail");
  job.setJarByClass(FailJob.class);
  job.setMapperClass(FailMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(FailReducer.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Fail job");
  FileInputFormat.addInputPath(job, inputFile);
  return job;
}
Example #15
Source File: TestMapReduceLazyOutput.java, from big-c (Apache License 2.0)

private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
Example #16
Source File: P1Q3.java, from IntroToHadoopAndMR__Udacity_Course (Apache License 2.0)

public final static void main(final String[] args) throws Exception {
  final Configuration conf = new Configuration();

  final Job job = new Job(conf, "P1Q3");
  job.setJarByClass(P1Q3.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(DoubleWritable.class);

  job.setMapperClass(P1Q3Map.class);
  //job.setCombinerClass(P1Q3Reduce.class);
  job.setReducerClass(P1Q3Reduce.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  job.waitForCompletion(true);
}
Example #17
Source File: ReplicatedUserJoin.java, from hadoop-map-reduce-patterns (Apache License 2.0)

@Override
public int run(String[] args) throws Exception {
  Configuration conf = new Configuration();
  GenericOptionsParser parser = new GenericOptionsParser(conf, args);
  String[] otherArgs = parser.getRemainingArgs();
  if (otherArgs.length != 4) {
    printUsage();
  }
  Job job = new Job(conf, "ReduceSideJoin");
  job.setJarByClass(ReplicatedUserJoin.class);

  // Use MultipleInputs to set which input uses what mapper.
  // This will keep parsing of each data set separate from a logical standpoint.
  // The first two elements of the args array are the two inputs.
  MultipleInputs.addInputPath(job, new Path(args[0]),
      TextInputFormat.class, UserJoinMapper.class);
  MultipleInputs.addInputPath(job, new Path(args[1]),
      TextInputFormat.class, CommentJoinMapper.class);
  job.getConfiguration().set("join.type", args[2]);

  job.setReducerClass(UserJoinReducer.class);

  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, new Path(args[3]));

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  return job.waitForCompletion(true) ? 0 : 2;
}
Example #18
Source File: SamplerJob.java, from hiped2 (Apache License 2.0)

/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();
  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(SamplerJob.class);

  ReservoirSamplerInputFormat.setInputFormat(job, TextInputFormat.class);
  ReservoirSamplerInputFormat.setNumSamples(job, 10);
  ReservoirSamplerInputFormat.setMaxRecordsToRead(job, 10000);
  ReservoirSamplerInputFormat.setUseSamplesNumberPerInputSplit(job, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
Example #19
Source File: ClusterHdfsSource.java, from datacollector (Apache License 2.0)

@VisibleForTesting
List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  int previewCount = previewBuffer.size();
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  List<Map.Entry> batch = new ArrayList<>();
  Path filePath = fileStatus.getPath();
  // MR allows file length to be 0 for text data (not for avro)
  if (fileLength == 0) {
    LOG.info("File length is 0 for {}", filePath);
    return batch;
  }
  // Hadoop does unsafe casting from long to int, so split length should
  // not be greater than int max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(filePath, 0, splitLength, null);
  TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
      TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));

  try (RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext)) {
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();

    while (hasNext && batch.size() < batchSize && previewCount < batchSize) {
      batch.add(new Pair(filePath.toUri().getPath() + "::" + recordReader.getCurrentKey(),
          String.valueOf(recordReader.getCurrentValue())));
      hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
      previewCount++;
    }
  }
  return batch;
}
Example #20
Source File: JMatrixMultiplicationStep1.java, from RecommendationEngine (MIT License)

public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String inputPath1 = ItemBasedCFDriver.path.get("step7InputPath1");
  String inputPath2 = ItemBasedCFDriver.path.get("step7InputPath2");
  String outputPath = ItemBasedCFDriver.path.get("step7OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ",");

  Job job = Job.getInstance(conf);

  HDFS hdfs = new HDFS(conf);
  hdfs.rmr(outputPath);

  job.setMapperClass(Step1_Mapper.class);
  job.setJarByClass(JMatrixMultiplicationStep1.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath1), new Path(inputPath2));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #21
Source File: CalculateSimilarityStep2.java, from RecommendationEngine (MIT License)

public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String inputPath = ItemBasedCFDriver.path.get("step2InputPath");
  String outputPath = ItemBasedCFDriver.path.get("step2OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ":");

  Job job = Job.getInstance(conf);

  HDFS hdfs = new HDFS(conf);
  hdfs.rmr(outputPath);

  job.setMapperClass(Step2_Mapper.class);
  job.setReducerClass(Step2_Reducer.class);
  job.setCombinerClass(Step2_Reducer.class);
  job.setNumReduceTasks(ItemBasedCFDriver.ReducerNumber);
  job.setJarByClass(CalculateSimilarityStep2.class);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #22
Source File: LouvainRunner.java, from distributed-graph-analytics (Apache License 2.0)

private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
  Configuration mrConf = new Configuration();
  for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
    mrConf.set(entry.getKey(), entry.getValue());
  }

  Job job = Job.getInstance(mrConf); // use the configuration populated above
  job.setJarByClass(LouvainRunner.class);
  Path in = new Path(inputPath);
  Path out = new Path(outputPath);

  FileInputFormat.setInputPaths(job, in);
  FileOutputFormat.setOutputPath(job, out);
  job.setJobName("CommunityCompression");

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LouvainVertexWritable.class);

  job.setMapperClass(CommunityCompression.Map.class);
  job.setReducerClass(CommunityCompression.Reduce.class);

  logger.debug("Running Mapreduce step with job configuration: {}", job);

  return job.waitForCompletion(false) ? 0 : 1;
}
Example #23
Source File: JobContextImpl.java, from tez (Apache License 2.0)

/**
 * Get the {@link InputFormat} class for the job.
 *
 * @return the {@link InputFormat} class for the job.
 */
@SuppressWarnings("unchecked")
public Class<? extends InputFormat<?, ?>> getInputFormatClass()
    throws ClassNotFoundException {
  return (Class<? extends InputFormat<?, ?>>)
      conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
}
Example #24
Source File: CommunityCompression.java, from distributed-graph-analytics (Apache License 2.0)

public int run(String[] args) throws Exception {
  Configuration mrConf = this.getConf();
  for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
    mrConf.set(entry.getKey(), entry.getValue());
  }

  Job job = Job.getInstance(mrConf);
  job.setJarByClass(CommunityCompression.class);
  Path in = new Path(inputPath);
  Path out = new Path(outputPath);

  FileInputFormat.setInputPaths(job, in);
  FileOutputFormat.setOutputPath(job, out);
  job.setJobName("CommunityCompression");

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LouvainVertexWritable.class);

  job.setMapperClass(CommunityCompression.Map.class);
  job.setReducerClass(CommunityCompression.Reduce.class);

  logger.debug("Running Mapreduce step with job configuration: {}", job);

  return job.waitForCompletion(false) ? 0 : 1;
}
Example #25
Source File: WordCount.java, from flink (Apache License 2.0)

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example #26
Source File: TestMRHelpers.java, from incubator-tez (Apache License 2.0)

private InputSplitInfo generateNewSplits(Path inputSplitsDir) throws Exception {
  JobConf jobConf = new JobConf();
  jobConf.setUseNewMapper(true);
  jobConf.setClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
      TextInputFormat.class, InputFormat.class);
  jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());
  return MRHelpers.generateInputSplits(jobConf, inputSplitsDir);
}
Example #27
Source File: WordCount.java, from accumulo-examples (Apache License 2.0)

public static void main(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(WordCount.class.getName(), args);

  // Create Accumulo table and attach Summing iterator
  try (AccumuloClient client = opts.createAccumuloClient()) {
    client.tableOperations().create(opts.tableName);
    IteratorSetting is = new IteratorSetting(10, SummingCombiner.class);
    SummingCombiner.setColumns(is,
        Collections.singletonList(new IteratorSetting.Column("count")));
    SummingCombiner.setEncodingType(is, SummingCombiner.Type.STRING);
    client.tableOperations().attachIterator(opts.tableName, is);
  } catch (TableExistsException e) {
    // ignore
  }

  // Create M/R job
  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(WordCount.class.getName());
  job.setJarByClass(WordCount.class);
  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, new Path(opts.inputDirectory));

  job.setMapperClass(MapClass.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

  if (opts.hdfsPath != null) {
    AccumuloOutputFormat.configure().clientPropertiesPath(opts.hdfsPath)
        .defaultTable(opts.tableName).store(job);
  } else {
    AccumuloOutputFormat.configure().clientProperties(opts.getClientProperties())
        .defaultTable(opts.tableName).store(job);
  }
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #28
Source File: Phase3Step2DistinctDataJob.java, from dkpro-c4corpus (Apache License 2.0)

@Override
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());

  job.setJarByClass(Phase3Step2DistinctDataJob.class);
  job.setJobName(Phase3Step2DistinctDataJob.class.getName());

  // mapper
  job.setMapperClass(RemoveRedundantDataMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NullWritable.class);

  // reducer
  job.setReducerClass(RemoveRedundantDataReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);

  // paths
  String commaSeparatedInputFiles = args[0];
  String outputPath = args[1];

  job.setInputFormatClass(TextInputFormat.class);
  LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

  // i/o paths
  FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  return job.waitForCompletion(true) ? 0 : 1;
}
Example #29
Source File: CalculateSimilarityStep5.java, from RecommendationEngine (MIT License)

public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String inputPath = ItemBasedCFDriver.path.get("step5InputPath");
  String outputPath = ItemBasedCFDriver.path.get("step5OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ":");

  Job job = Job.getInstance(conf);

  HDFS hdfs = new HDFS(conf);
  hdfs.rmr(outputPath);

  job.setMapperClass(Step5_Mapper.class);
  job.setJarByClass(CalculateSimilarityStep5.class);

  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(IntWritable.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.waitForCompletion(true);
}
Example #30
Source File: TestMRInputHelpers.java, from tez (Apache License 2.0)

private DataSourceDescriptor generateDataSourceDescriptorMapReduce(Path inputSplitsDir)
    throws Exception {
  JobConf jobConf = new JobConf(dfsCluster.getFileSystem().getConf());
  jobConf.setUseNewMapper(true);
  jobConf.setClass(org.apache.hadoop.mapreduce.MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
      TextInputFormat.class, InputFormat.class);
  jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());

  return MRInputHelpers.configureMRInputWithLegacySplitGeneration(jobConf,
      inputSplitsDir, true);
}