Java Code Examples for org.apache.hadoop.mapreduce.lib.input.TextInputFormat#addInputPath()
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.TextInputFormat#addInputPath(). Each example notes its original project, source file, and license.
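Before the project examples, here is a minimal, self-contained sketch of the usual pattern. The class name, job name, and input/output paths are hypothetical placeholders, not drawn from any project below. addInputPath() is a static method TextInputFormat inherits from FileInputFormat, and it can be called more than once to append additional input directories to the same Job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class AddInputPathSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "text-input-example"); // hypothetical job name

    job.setInputFormatClass(TextInputFormat.class);
    // addInputPath may be called repeatedly; each call appends another input directory
    TextInputFormat.addInputPath(job, new Path("/data/input-a")); // placeholder paths
    TextInputFormat.addInputPath(job, new Path("/data/input-b"));

    // Identity mapper keeps the sketch runnable; real jobs plug in their own Mapper
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    TextOutputFormat.setOutputPath(job, new Path("/data/output")); // placeholder path
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}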
Example 1
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0

private void write(final Configuration conf, final Path inputPath, final Path parquetPath,
    Class<? extends Mapper> mapperClass, Class<? extends TBase<?, ?>> outputClass) throws IOException, Exception {
  final Job job = new Job(conf, "write");

  // input not really used
  TextInputFormat.addInputPath(job, inputPath);
  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(ParquetThriftOutputFormat.class);
  ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
  ParquetThriftOutputFormat.setOutputPath(job, parquetPath);
  ParquetThriftOutputFormat.setThriftClass(job, outputClass);

  waitForJob(job);
}
Example 2
Source File: ShapefileVectorInputFormatProvider.java From mrgeo with Apache License 2.0

@Override
public void setupJob(Job job, ProviderProperties providerProperties) throws DataProviderException {
  super.setupJob(job, providerProperties);

  Configuration conf = job.getConfiguration();
  String strBasePath = MrGeoProperties.getInstance().getProperty(MrGeoConstants.MRGEO_HDFS_VECTOR,
      "/mrgeo/vectors");
  conf.set("hdfs." + MrGeoConstants.MRGEO_HDFS_VECTOR, strBasePath);

  for (String input : getContext().getInputs()) {
    try {
      // Set up native input format
      TextInputFormat.addInputPath(job, new Path(strBasePath, input));
    } catch (IOException e) {
      throw new DataProviderException(e);
    }
  }
}
Example 3
Source File: UserNamePermission.java From hadoop with Apache License 2.0

public static void main(String[] args) throws Exception {
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 4
Source File: TestSpecificInputOutputFormat.java From parquet-mr with Apache License 2.0

@Before
public void createParquetFile() throws Exception {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TestSpecificInputOutputFormat.MyMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setOutputPath(job, parquetPath);
    AvroParquetOutputFormat.setSchema(job, Car.SCHEMA$);

    waitForJob(job);
  }
}
Example 5
Source File: UserNamePermission.java From big-c with Apache License 2.0

public static void main(String[] args) throws Exception {
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 6
Source File: NGramIngest.java From accumulo-examples with Apache License 2.0

public static void main(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(NGramIngest.class.getName(), args);

  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(NGramIngest.class.getSimpleName());
  job.setJarByClass(NGramIngest.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  AccumuloOutputFormat.configure().clientProperties(opts.getClientProperties())
      .defaultTable(opts.tableName).store(job);

  job.setMapperClass(NGramMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Mutation.class);

  job.setNumReduceTasks(0);
  job.setSpeculativeExecution(false);

  try (AccumuloClient client = opts.createAccumuloClient()) {
    if (!client.tableOperations().exists(opts.tableName)) {
      log.info("Creating table " + opts.tableName);
      client.tableOperations().create(opts.tableName);
      SortedSet<Text> splits = new TreeSet<>();
      String[] numbers = "1 2 3 4 5 6 7 8 9".split("\\s");
      String[] lower = "a b c d e f g h i j k l m n o p q r s t u v w x y z".split("\\s");
      String[] upper = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split("\\s");
      for (String[] array : new String[][] {numbers, lower, upper}) {
        for (String s : array) {
          splits.add(new Text(s));
        }
      }
      client.tableOperations().addSplits(opts.tableName, splits);
    }
  }

  TextInputFormat.addInputPath(job, new Path(opts.inputDirectory));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 7
Source File: DeprecatedInputFormatTest.java From parquet-mr with Apache License 2.0

private void runMapReduceJob(CompressionCodecName codec)
    throws IOException, ClassNotFoundException, InterruptedException {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ExampleOutputFormat.setCompression(writeJob, codec);
    ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ExampleOutputFormat.class);
    writeJob.setMapperClass(ReadMapper.class);
    ExampleOutputFormat.setSchema(
        writeJob,
        MessageTypeParser.parseMessageType(writeSchema));
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
    jobConf.setInputFormat(MyDeprecatedInputFormat.class);
    MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
    jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
    org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
    jobConf.setMapperClass(DeprecatedWriteMapper.class);
    jobConf.setNumReduceTasks(0);
    mapRedJob = JobClient.runJob(jobConf);
  }
}
Example 8
Source File: AbstractMRNewApiSaveTest.java From elasticsearch-hadoop with Apache License 2.0

@Parameters
public static Collection<Object[]> configs() throws IOException {
  Configuration conf = HdpBootstrap.hadoopConfig();
  HadoopCfgUtils.setGenericOptions(conf);
  Job job = new Job(conf);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(EsOutputFormat.class);
  job.setMapOutputValueClass(LinkedMapWritable.class);
  job.setMapperClass(TabMapper.class);
  job.setNumReduceTasks(0);

  Job standard = new Job(job.getConfiguration());
  File fl = MRSuite.testData.sampleArtistsDatFile();
  long splitSize = fl.length() / 3;
  TextInputFormat.setMaxInputSplitSize(standard, splitSize);
  TextInputFormat.setMinInputSplitSize(standard, 50);
  standard.setMapperClass(TabMapper.class);
  standard.setMapOutputValueClass(LinkedMapWritable.class);
  TextInputFormat.addInputPath(standard, new Path(MRSuite.testData.sampleArtistsDat(conf)));

  Job json = new Job(job.getConfiguration());
  json.setMapperClass(Mapper.class);
  json.setMapOutputValueClass(Text.class);
  json.getConfiguration().set(ConfigurationOptions.ES_INPUT_JSON, "true");
  TextInputFormat.addInputPath(json, new Path(MRSuite.testData.sampleArtistsJson(conf)));

  return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
Example 9
Source File: WordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example 10
Source File: WordCount.java From Flink-CEPplus with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example 11
Source File: WordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example 12
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0

public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 3) {
    System.err.println("Usage: JobChainingDriver <posts> <users> <out>");
    System.exit(2);
  }

  Path postInput = new Path(otherArgs[0]);
  Path userInput = new Path(otherArgs[1]);
  Path outputDirIntermediate = new Path(otherArgs[2] + "_int");
  Path outputDir = new Path(otherArgs[2]);

  // Setup first job to counter user posts
  Job countingJob = new Job(conf, "JobChaining-Counting");
  countingJob.setJarByClass(BasicJobChaining.class);

  // Set our mapper and reducer, we can use the API's long sum reducer for
  // a combiner!
  countingJob.setMapperClass(UserIdCountMapper.class);
  countingJob.setCombinerClass(LongSumReducer.class);
  countingJob.setReducerClass(UserIdSumReducer.class);

  countingJob.setOutputKeyClass(Text.class);
  countingJob.setOutputValueClass(LongWritable.class);

  countingJob.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(countingJob, postInput);

  countingJob.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

  // Execute job and grab exit code
  int code = countingJob.waitForCompletion(true) ? 0 : 1;

  if (code == 0) {
    // Calculate the average posts per user by getting counter values
    double numRecords = (double) countingJob.getCounters()
        .findCounter(AVERAGE_CALC_GROUP, UserIdCountMapper.RECORDS_COUNTER_NAME).getValue();
    double numUsers = (double) countingJob.getCounters()
        .findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME).getValue();

    double averagePostsPerUser = numRecords / numUsers;

    // Setup binning job
    Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
    binningJob.setJarByClass(BasicJobChaining.class);

    // Set mapper and the average posts per user
    binningJob.setMapperClass(UserIdBinningMapper.class);
    UserIdBinningMapper.setAveragePostsPerUser(binningJob, averagePostsPerUser);

    binningJob.setNumReduceTasks(0);

    binningJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(binningJob, outputDirIntermediate);

    // Add two named outputs for below/above average
    MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_BELOW_NAME,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_ABOVE_NAME,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(binningJob, true);

    TextOutputFormat.setOutputPath(binningJob, outputDir);

    // Add the user files to the DistributedCache
    FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
    for (FileStatus status : userFiles) {
      DistributedCache.addCacheFile(status.getPath().toUri(), binningJob.getConfiguration());
    }

    // Execute job and grab exit code
    code = binningJob.waitForCompletion(true) ? 0 : 1;
  }

  // Clean up the intermediate output
  FileSystem.get(conf).delete(outputDirIntermediate, true);

  System.exit(code);
}
Example 13
Source File: TopBusyAirport.java From gemfirexd-oss with Apache License 2.0

public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();

  Configuration conf = getConf();

  Path outputPath = new Path(args[0]);
  Path intermediateOutputPath = new Path(args[0] + "_int");
  String hdfsHomeDir = args[1];
  String tableName = args[2];

  outputPath.getFileSystem(conf).delete(outputPath, true);
  intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  Job job = Job.getInstance(conf, "Busy Airport Count");

  job.setInputFormatClass(RowInputFormat.class);

  // configure mapper and reducer
  job.setMapperClass(SampleMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);

  // Only have one reduce task so that all of the results from mapping are
  // processed in one place.
  job.setNumReduceTasks(1);

  // configure output
  TextOutputFormat.setOutputPath(job, intermediateOutputPath);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  int rc = job.waitForCompletion(true) ? 0 : 1;
  if (rc == 0) {
    Job topJob = Job.getInstance(getConf(), "Top Busy Airport");

    // We want the task to run on a single VM
    topJob.setNumReduceTasks(1);

    // Set the inputs
    topJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(topJob, intermediateOutputPath);

    // Set the mapper and reducer
    topJob.setMapperClass(TopBusyAirportMapper.class);
    topJob.setReducerClass(TopBusyAirportReducer.class);

    // Set the outputs
    TextOutputFormat.setOutputPath(topJob, outputPath);
    topJob.setOutputFormatClass(TextOutputFormat.class);
    topJob.setOutputKeyClass(Text.class);
    topJob.setOutputValueClass(IntWritable.class);

    topJob.setMapOutputKeyClass(Text.class);
    topJob.setMapOutputValueClass(StringIntPair.class);

    rc = topJob.waitForCompletion(true) ? 0 : 1;
  }
  return rc;
}
Example 14
Source File: TestMRJob.java From s3committer with Apache License 2.0

@Test
public void testMRJob() throws Exception {
  FileSystem mockS3 = mock(FileSystem.class);
  FileSystem s3 = S3_OUTPUT_PATH.getFileSystem(getConfiguration());
  if (s3 instanceof MockS3FileSystem) {
    ((MockS3FileSystem) s3).setMock(mockS3);
  } else {
    throw new RuntimeException("Cannot continue: S3 not mocked");
  }

  String commitUUID = UUID.randomUUID().toString();

  int numFiles = 3;
  Set<String> expectedFiles = Sets.newHashSet();
  for (int i = 0; i < numFiles; i += 1) {
    File file = temp.newFile(String.valueOf(i) + ".text");
    try (FileOutputStream out = new FileOutputStream(file)) {
      out.write(("file " + i).getBytes(StandardCharsets.UTF_8));
    }
    expectedFiles.add(new Path(S3_OUTPUT_PATH, "part-m-0000" + i + "-" + commitUUID).toString());
  }

  Job mrJob = Job.getInstance(MR_CLUSTER.getConfig(), "test-committer-job");
  Configuration conf = mrJob.getConfiguration();

  mrJob.setOutputFormatClass(S3TextOutputFormat.class);
  S3TextOutputFormat.setOutputPath(mrJob, S3_OUTPUT_PATH);

  File mockResultsFile = temp.newFile("committer.bin");
  mockResultsFile.delete();
  String committerPath = "file:" + mockResultsFile;
  conf.set("mock-results-file", committerPath);
  conf.set(UPLOAD_UUID, commitUUID);

  mrJob.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(mrJob, new Path("file:" + temp.getRoot().toString()));

  mrJob.setMapperClass(M.class);
  mrJob.setNumReduceTasks(0);

  mrJob.submit();
  Assert.assertTrue("MR job should succeed", mrJob.waitForCompletion(true));

  TestUtil.ClientResults results;
  try (ObjectInputStream in = new ObjectInputStream(
      FileSystem.getLocal(conf).open(new Path(committerPath)))) {
    results = (TestUtil.ClientResults) in.readObject();
  }

  Assert.assertEquals("Should not delete files", 0, results.deletes.size());
  Assert.assertEquals("Should not abort commits", 0, results.aborts.size());
  Assert.assertEquals("Should commit task output files", numFiles, results.commits.size());

  Set<String> actualFiles = Sets.newHashSet();
  for (CompleteMultipartUploadRequest commit : results.commits) {
    actualFiles.add("s3://" + commit.getBucketName() + "/" + commit.getKey());
  }

  Assert.assertEquals("Should commit the correct file paths", expectedFiles, actualFiles);
}
Example 15
Source File: DelimitedVectorInputFormatProvider.java From mrgeo with Apache License 2.0

@Override
public void setupJob(Job job, ProviderProperties providerProperties) throws DataProviderException {
  super.setupJob(job, providerProperties);

  Configuration conf = job.getConfiguration();
  String strBasePath = MrGeoProperties.getInstance().getProperty(MrGeoConstants.MRGEO_HDFS_VECTOR,
      "/mrgeo/vectors");
  conf.set("hdfs." + MrGeoConstants.MRGEO_HDFS_VECTOR, strBasePath);

  long featureCount = getContext().getFeatureCount();
  int minFeaturesPerSplit = getContext().getMinFeaturesPerSplit();
  boolean calcFeatureCount = (minFeaturesPerSplit > 0 && featureCount < 0);
  if (calcFeatureCount) {
    featureCount = 0L;
  }

  for (String input : getContext().getInputs()) {
    try {
      // Set up native input format
      TextInputFormat.addInputPath(job, new Path(strBasePath, input));

      // Compute the number of features across all inputs if we don't already
      // have it in the context.
      if (calcFeatureCount) {
        VectorDataProvider dp = DataProviderFactory.getVectorDataProvider(input, AccessMode.READ,
            providerProperties);
        if (dp != null) {
          featureCount += dp.getVectorReader().count();
        }
      }
    } catch (IOException e) {
      throw new DataProviderException(e);
    }
  }

  DelimitedVectorInputFormat.setupJob(job, getContext().getMinFeaturesPerSplit(), featureCount);
}
Example 16
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0

private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration(this.conf);
  for (Map.Entry<String, String> entry : extraConf.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setCompression(writeJob, codec);
    ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(readMapperClass);
    ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema),
        writeJob.getConfiguration());
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    readJob = new Job(conf, "read");
    readJob.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);
    ParquetInputFormat.setInputPaths(readJob, parquetPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(readJob, outputPath);
    readJob.setMapperClass(writeMapperClass);
    readJob.setNumReduceTasks(0);
    readJob.submit();
    waitForJob(readJob);
  }
}
Example 17
Source File: TestInputOutputFormatWithPadding.java From parquet-mr with Apache License 2.0

@Test
public void testBasicBehaviorWithPadding() throws Exception {
  HadoopOutputFile.getBlockFileSystems().add("file");

  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();

  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());

  File outputFolder = temp.newFile();
  outputFolder.delete();

  Configuration conf = new Configuration();
  // May test against multiple hadoop versions
  conf.set("dfs.block.size", "1024");
  conf.set("dfs.blocksize", "1024");
  conf.set("dfs.blockSize", "1024");
  conf.set("fs.local.block.size", "1024");

  // don't use a cached FS with a different block size
  conf.set("fs.file.impl.disable.cache", "true");

  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());

  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
    ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
    ParquetOutputFormat.setBlockSize(writeJob, 1024);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);

    waitForJob(writeJob);
  }

  // make sure padding was added
  File parquetFile = getDataFile(tempFolder);
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()),
      ParquetMetadataConverter.NO_FILTER);
  for (BlockMetaData block : footer.getBlocks()) {
    Assert.assertTrue("Block should start at a multiple of the block size",
        block.getStartingPos() % 1024 == 0);
  }

  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(NoSplits.class);
    ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
    TextInputFormat.addInputPath(readJob, tempPath);

    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    readJob.setNumReduceTasks(0); // write directly to text without reduce
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

    waitForJob(readJob);
  }

  File dataFile = getDataFile(outputFolder);
  Assert.assertNotNull("Should find a data file", dataFile);

  StringBuilder contentBuilder = new StringBuilder();
  for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
    contentBuilder.append(line);
  }
  String reconstructed = contentBuilder.toString();
  Assert.assertEquals("Should match written file content", FILE_CONTENT, reconstructed);

  HadoopOutputFile.getBlockFileSystems().remove("file");
}
Example 18
Source File: TestInputFormatColumnProjection.java From parquet-mr with Apache License 2.0

@Test
public void testProjectionSize() throws Exception {
  Assume.assumeTrue( // only run this test for Hadoop 2
      org.apache.hadoop.mapreduce.JobContext.class.isInterface());

  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();

  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());

  File outputFolder = temp.newFile();
  outputFolder.delete();

  Configuration conf = new Configuration();
  // set the projection schema
  conf.set("parquet.read.schema", Types.buildMessage()
      .required(BINARY).as(UTF8).named("char")
      .named("FormatTestObject").toString());

  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());

  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

    writeJob.setOutputFormatClass(ExampleOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
    ParquetOutputFormat.setBlockSize(writeJob, 10240);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 1024);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);

    waitForJob(writeJob);
  }

  long bytesWritten = 0;
  FileSystem fs = FileSystem.getLocal(conf);
  for (FileStatus file : fs.listStatus(tempPath)) {
    bytesWritten += file.getLen();
  }

  long bytesRead;
  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(ExampleInputFormat.class);
    TextInputFormat.addInputPath(readJob, tempPath);

    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    readJob.setNumReduceTasks(0); // no reduce phase
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

    waitForJob(readJob);

    bytesRead = Reader.bytesReadCounter.getValue();
  }

  Assert.assertTrue("Should read less than 10% of the input file size",
      bytesRead < (bytesWritten / 10));
}
Example 19
Source File: WriteUsingMR.java From parquet-mr with Apache License 2.0

public Path write(Message... messages) throws Exception {
  synchronized (WriteUsingMR.class) {
    outputPath = TestUtils.someTemporaryFilePath();

    Path inputPath = TestUtils.someTemporaryFilePath();
    FileSystem fileSystem = inputPath.getFileSystem(conf);
    fileSystem.create(inputPath);

    inputMessages = Collections.unmodifiableList(Arrays.asList(messages));

    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(WritingMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(ProtoParquetOutputFormat.class);
    ProtoParquetOutputFormat.setOutputPath(job, outputPath);
    ProtoParquetOutputFormat.setProtobufClass(job, TestUtils.inferRecordsClass(messages));

    waitForJob(job);

    inputMessages = null;
    return outputPath;
  }
}
Example 20
Source File: MyWordCount.java From BigDataArchitect with Apache License 2.0

public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration(true);

  // GenericOptionsParser sets -D and similar options directly on conf for us,
  // leaving only the remaining command-line arguments.
  GenericOptionsParser parser = new GenericOptionsParser(conf, args);
  String[] othargs = parser.getRemainingArgs();

  // Let the framework know the job is submitted from a Windows (cross-platform) client.
  conf.set("mapreduce.app-submission.cross-platform", "true");
  // conf.set("mapreduce.framework.name","local");
  // System.out.println(conf.get("mapreduce.framework.name"));

  Job job = Job.getInstance(conf);
  // FileInputFormat.setMinInputSplitSize(job,2222);
  // job.setInputFormatClass(ooxx.class);

  job.setJar("C:\\Users\\admin\\IdeaProjects\\msbhadoop\\target\\hadoop-hdfs-1.0-0.1.jar");
  // Absolutely required.
  job.setJarByClass(MyWordCount.class);

  job.setJobName("mashibing");

  Path infile = new Path(othargs[0]);
  TextInputFormat.addInputPath(job, infile);

  Path outfile = new Path(othargs[1]);
  if (outfile.getFileSystem(conf).exists(outfile)) {
    outfile.getFileSystem(conf).delete(outfile, true);
  }
  TextOutputFormat.setOutputPath(job, outfile);

  job.setMapperClass(MyMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(MyReducer.class);
  // job.setNumReduceTasks(2);

  // Submit the job, then poll for progress until the job is complete
  job.waitForCompletion(true);
}