Java Code Examples for org.apache.hadoop.mapreduce.lib.input.TextInputFormat#addInputPath()
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.TextInputFormat#addInputPath(). Each example notes its original project, source file, and license.
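Before the project examples, here is a minimal, self-contained sketch of the usual pattern. The class name, job name, and input/output paths are hypothetical placeholders, not drawn from any project below. addInputPath() is a static method TextInputFormat inherits from FileInputFormat, and it can be called more than once to append additional input directories to the same Job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class AddInputPathSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "text-input-example"); // hypothetical job name

    job.setInputFormatClass(TextInputFormat.class);
    // addInputPath may be called repeatedly; each call appends another input directory
    TextInputFormat.addInputPath(job, new Path("/data/input-a")); // placeholder paths
    TextInputFormat.addInputPath(job, new Path("/data/input-b"));

    // Identity mapper keeps the sketch runnable; real jobs plug in their own Mapper
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    TextOutputFormat.setOutputPath(job, new Path("/data/output")); // placeholder path
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}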
Example 1
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0

private void write(final Configuration conf, final Path inputPath, final Path parquetPath,
    Class<? extends Mapper> mapperClass, Class<? extends TBase<?, ?>> outputClass) throws IOException, Exception {
  final Job job = new Job(conf, "write");

  // input not really used
  TextInputFormat.addInputPath(job, inputPath);
  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(ParquetThriftOutputFormat.class);
  ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
  ParquetThriftOutputFormat.setOutputPath(job, parquetPath);
  ParquetThriftOutputFormat.setThriftClass(job, outputClass);

  waitForJob(job);
}
Example 2
Source File: ShapefileVectorInputFormatProvider.java From mrgeo with Apache License 2.0

@Override
public void setupJob(Job job, ProviderProperties providerProperties) throws DataProviderException {
  super.setupJob(job, providerProperties);

  Configuration conf = job.getConfiguration();
  String strBasePath = MrGeoProperties.getInstance().getProperty(MrGeoConstants.MRGEO_HDFS_VECTOR,
      "/mrgeo/vectors");
  conf.set("hdfs." + MrGeoConstants.MRGEO_HDFS_VECTOR, strBasePath);

  for (String input : getContext().getInputs()) {
    try {
      // Set up native input format
      TextInputFormat.addInputPath(job, new Path(strBasePath, input));
    } catch (IOException e) {
      throw new DataProviderException(e);
    }
  }
}
Example 3
Source File: UserNamePermission.java From hadoop with Apache License 2.0

public static void main(String[] args) throws Exception {
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 4
Source File: TestSpecificInputOutputFormat.java From parquet-mr with Apache License 2.0

@Before
public void createParquetFile() throws Exception {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TestSpecificInputOutputFormat.MyMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setOutputPath(job, parquetPath);
    AvroParquetOutputFormat.setSchema(job, Car.SCHEMA$);

    waitForJob(job);
  }
}
Example 5
Source File: UserNamePermission.java From big-c with Apache License 2.0

public static void main(String[] args) throws Exception {
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 6
Source File: NGramIngest.java From accumulo-examples with Apache License 2.0

public static void main(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(NGramIngest.class.getName(), args);

  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(NGramIngest.class.getSimpleName());
  job.setJarByClass(NGramIngest.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  AccumuloOutputFormat.configure().clientProperties(opts.getClientProperties())
      .defaultTable(opts.tableName).store(job);

  job.setMapperClass(NGramMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Mutation.class);

  job.setNumReduceTasks(0);
  job.setSpeculativeExecution(false);

  try (AccumuloClient client = opts.createAccumuloClient()) {
    if (!client.tableOperations().exists(opts.tableName)) {
      log.info("Creating table " + opts.tableName);
      client.tableOperations().create(opts.tableName);
      SortedSet<Text> splits = new TreeSet<>();
      String[] numbers = "1 2 3 4 5 6 7 8 9".split("\\s");
      String[] lower = "a b c d e f g h i j k l m n o p q r s t u v w x y z".split("\\s");
      String[] upper = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split("\\s");
      for (String[] array : new String[][] {numbers, lower, upper}) {
        for (String s : array) {
          splits.add(new Text(s));
        }
      }
      client.tableOperations().addSplits(opts.tableName, splits);
    }
  }

  TextInputFormat.addInputPath(job, new Path(opts.inputDirectory));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 7
Source File: DeprecatedInputFormatTest.java From parquet-mr with Apache License 2.0

private void runMapReduceJob(CompressionCodecName codec)
    throws IOException, ClassNotFoundException, InterruptedException {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ExampleOutputFormat.setCompression(writeJob, codec);
    ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ExampleOutputFormat.class);
    writeJob.setMapperClass(ReadMapper.class);
    ExampleOutputFormat.setSchema(
        writeJob,
        MessageTypeParser.parseMessageType(writeSchema));
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
    jobConf.setInputFormat(MyDeprecatedInputFormat.class);
    MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
    jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
    org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
    jobConf.setMapperClass(DeprecatedWriteMapper.class);
    jobConf.setNumReduceTasks(0);
    mapRedJob = JobClient.runJob(jobConf);
  }
}
Example 8
Source File: AbstractMRNewApiSaveTest.java From elasticsearch-hadoop with Apache License 2.0

@Parameters
public static Collection<Object[]> configs() throws IOException {
  Configuration conf = HdpBootstrap.hadoopConfig();
  HadoopCfgUtils.setGenericOptions(conf);
  Job job = new Job(conf);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(EsOutputFormat.class);
  job.setMapOutputValueClass(LinkedMapWritable.class);
  job.setMapperClass(TabMapper.class);
  job.setNumReduceTasks(0);

  Job standard = new Job(job.getConfiguration());
  File fl = MRSuite.testData.sampleArtistsDatFile();
  long splitSize = fl.length() / 3;
  TextInputFormat.setMaxInputSplitSize(standard, splitSize);
  TextInputFormat.setMinInputSplitSize(standard, 50);
  standard.setMapperClass(TabMapper.class);
  standard.setMapOutputValueClass(LinkedMapWritable.class);
  TextInputFormat.addInputPath(standard, new Path(MRSuite.testData.sampleArtistsDat(conf)));

  Job json = new Job(job.getConfiguration());
  json.setMapperClass(Mapper.class);
  json.setMapOutputValueClass(Text.class);
  json.getConfiguration().set(ConfigurationOptions.ES_INPUT_JSON, "true");
  TextInputFormat.addInputPath(json, new Path(MRSuite.testData.sampleArtistsJson(conf)));

  return Arrays.asList(new Object[][] { { standard, "" }, { json, "json-" } });
}
Example 9
Source File: WordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example 10
Source File: WordCount.java From Flink-CEPplus with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example 11
Source File: WordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  Job job = Job.getInstance();
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
  TextInputFormat.addInputPath(job, new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

  // Tokenize the line and convert from Writable "Text" to String for better handling
  DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

  // Sum up the words
  DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

  // Convert String back to Writable "Text" for use with Hadoop Output Format
  DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
  hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
  hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
  TextOutputFormat.setOutputPath(job, new Path(outputPath));

  // Output & Execute
  hadoopResult.output(hadoopOutputFormat);
  env.execute("Word Count");
}
Example 12
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0

public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 3) {
    System.err.println("Usage: JobChainingDriver <posts> <users> <out>");
    System.exit(2);
  }

  Path postInput = new Path(otherArgs[0]);
  Path userInput = new Path(otherArgs[1]);
  Path outputDirIntermediate = new Path(otherArgs[2] + "_int");
  Path outputDir = new Path(otherArgs[2]);

  // Setup first job to counter user posts
  Job countingJob = new Job(conf, "JobChaining-Counting");
  countingJob.setJarByClass(BasicJobChaining.class);

  // Set our mapper and reducer, we can use the API's long sum reducer for
  // a combiner!
  countingJob.setMapperClass(UserIdCountMapper.class);
  countingJob.setCombinerClass(LongSumReducer.class);
  countingJob.setReducerClass(UserIdSumReducer.class);

  countingJob.setOutputKeyClass(Text.class);
  countingJob.setOutputValueClass(LongWritable.class);

  countingJob.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(countingJob, postInput);

  countingJob.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

  // Execute job and grab exit code
  int code = countingJob.waitForCompletion(true) ? 0 : 1;

  if (code == 0) {
    // Calculate the average posts per user by getting counter values
    double numRecords = (double) countingJob.getCounters()
        .findCounter(AVERAGE_CALC_GROUP, UserIdCountMapper.RECORDS_COUNTER_NAME).getValue();
    double numUsers = (double) countingJob.getCounters()
        .findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME).getValue();

    double averagePostsPerUser = numRecords / numUsers;

    // Setup binning job
    Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
    binningJob.setJarByClass(BasicJobChaining.class);

    // Set mapper and the average posts per user
    binningJob.setMapperClass(UserIdBinningMapper.class);
    UserIdBinningMapper.setAveragePostsPerUser(binningJob, averagePostsPerUser);

    binningJob.setNumReduceTasks(0);

    binningJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(binningJob, outputDirIntermediate);

    // Add two named outputs for below/above average
    MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_BELOW_NAME,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_ABOVE_NAME,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(binningJob, true);

    TextOutputFormat.setOutputPath(binningJob, outputDir);

    // Add the user files to the DistributedCache
    FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
    for (FileStatus status : userFiles) {
      DistributedCache.addCacheFile(status.getPath().toUri(), binningJob.getConfiguration());
    }

    // Execute job and grab exit code
    code = binningJob.waitForCompletion(true) ? 0 : 1;
  }

  // Clean up the intermediate output
  FileSystem.get(conf).delete(outputDirIntermediate, true);

  System.exit(code);
}
Example 13
Source File: TopBusyAirport.java From gemfirexd-oss with Apache License 2.0

public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();

  Configuration conf = getConf();

  Path outputPath = new Path(args[0]);
  Path intermediateOutputPath = new Path(args[0] + "_int");
  String hdfsHomeDir = args[1];
  String tableName = args[2];

  outputPath.getFileSystem(conf).delete(outputPath, true);
  intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  Job job = Job.getInstance(conf, "Busy Airport Count");

  job.setInputFormatClass(RowInputFormat.class);

  // configure mapper and reducer
  job.setMapperClass(SampleMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);

  // Only have one reduce task so that all of the results from mapping are
  // processed in one place.
  job.setNumReduceTasks(1);

  // configure output
  TextOutputFormat.setOutputPath(job, intermediateOutputPath);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  int rc = job.waitForCompletion(true) ? 0 : 1;
  if (rc == 0) {
    Job topJob = Job.getInstance(getConf(), "Top Busy Airport");

    // We want the task to run on a single VM
    topJob.setNumReduceTasks(1);

    // Set the inputs
    topJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(topJob, intermediateOutputPath);

    // Set the mapper and reducer
    topJob.setMapperClass(TopBusyAirportMapper.class);
    topJob.setReducerClass(TopBusyAirportReducer.class);

    // Set the outputs
    TextOutputFormat.setOutputPath(topJob, outputPath);
    topJob.setOutputFormatClass(TextOutputFormat.class);
    topJob.setOutputKeyClass(Text.class);
    topJob.setOutputValueClass(IntWritable.class);

    topJob.setMapOutputKeyClass(Text.class);
    topJob.setMapOutputValueClass(StringIntPair.class);

    rc = topJob.waitForCompletion(true) ? 0 : 1;
  }
  return rc;
}
Example 14
Source File: TestMRJob.java From s3committer with Apache License 2.0

@Test
public void testMRJob() throws Exception {
  FileSystem mockS3 = mock(FileSystem.class);
  FileSystem s3 = S3_OUTPUT_PATH.getFileSystem(getConfiguration());
  if (s3 instanceof MockS3FileSystem) {
    ((MockS3FileSystem) s3).setMock(mockS3);
  } else {
    throw new RuntimeException("Cannot continue: S3 not mocked");
  }

  String commitUUID = UUID.randomUUID().toString();

  int numFiles = 3;
  Set<String> expectedFiles = Sets.newHashSet();
  for (int i = 0; i < numFiles; i += 1) {
    File file = temp.newFile(String.valueOf(i) + ".text");
    try (FileOutputStream out = new FileOutputStream(file)) {
      out.write(("file " + i).getBytes(StandardCharsets.UTF_8));
    }
    expectedFiles.add(new Path(S3_OUTPUT_PATH, "part-m-0000" + i + "-" + commitUUID).toString());
  }

  Job mrJob = Job.getInstance(MR_CLUSTER.getConfig(), "test-committer-job");
  Configuration conf = mrJob.getConfiguration();

  mrJob.setOutputFormatClass(S3TextOutputFormat.class);
  S3TextOutputFormat.setOutputPath(mrJob, S3_OUTPUT_PATH);

  File mockResultsFile = temp.newFile("committer.bin");
  mockResultsFile.delete();
  String committerPath = "file:" + mockResultsFile;
  conf.set("mock-results-file", committerPath);
  conf.set(UPLOAD_UUID, commitUUID);

  mrJob.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(mrJob, new Path("file:" + temp.getRoot().toString()));

  mrJob.setMapperClass(M.class);
  mrJob.setNumReduceTasks(0);

  mrJob.submit();
  Assert.assertTrue("MR job should succeed", mrJob.waitForCompletion(true));

  TestUtil.ClientResults results;
  try (ObjectInputStream in = new ObjectInputStream(
      FileSystem.getLocal(conf).open(new Path(committerPath)))) {
    results = (TestUtil.ClientResults) in.readObject();
  }

  Assert.assertEquals("Should not delete files", 0, results.deletes.size());
  Assert.assertEquals("Should not abort commits", 0, results.aborts.size());
  Assert.assertEquals("Should commit task output files", numFiles, results.commits.size());

  Set<String> actualFiles = Sets.newHashSet();
  for (CompleteMultipartUploadRequest commit : results.commits) {
    actualFiles.add("s3://" + commit.getBucketName() + "/" + commit.getKey());
  }

  Assert.assertEquals("Should commit the correct file paths", expectedFiles, actualFiles);
}
Example 15
Source File: DelimitedVectorInputFormatProvider.java From mrgeo with Apache License 2.0

@Override
public void setupJob(Job job, ProviderProperties providerProperties) throws DataProviderException {
  super.setupJob(job, providerProperties);

  Configuration conf = job.getConfiguration();
  String strBasePath = MrGeoProperties.getInstance().getProperty(MrGeoConstants.MRGEO_HDFS_VECTOR,
      "/mrgeo/vectors");
  conf.set("hdfs." + MrGeoConstants.MRGEO_HDFS_VECTOR, strBasePath);

  long featureCount = getContext().getFeatureCount();
  int minFeaturesPerSplit = getContext().getMinFeaturesPerSplit();
  boolean calcFeatureCount = (minFeaturesPerSplit > 0 && featureCount < 0);
  if (calcFeatureCount) {
    featureCount = 0L;
  }

  for (String input : getContext().getInputs()) {
    try {
      // Set up native input format
      TextInputFormat.addInputPath(job, new Path(strBasePath, input));

      // Compute the number of features across all inputs if we don't already
      // have it in the context.
      if (calcFeatureCount) {
        VectorDataProvider dp = DataProviderFactory.getVectorDataProvider(input, AccessMode.READ,
            providerProperties);
        if (dp != null) {
          featureCount += dp.getVectorReader().count();
        }
      }
    } catch (IOException e) {
      throw new DataProviderException(e);
    }
  }

  DelimitedVectorInputFormat.setupJob(job, getContext().getMinFeaturesPerSplit(), featureCount);
}
Example 16
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0

private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration(this.conf);
  for (Map.Entry<String, String> entry : extraConf.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setCompression(writeJob, codec);
    ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(readMapperClass);
    ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema),
        writeJob.getConfiguration());
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    readJob = new Job(conf, "read");
    readJob.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);
    ParquetInputFormat.setInputPaths(readJob, parquetPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(readJob, outputPath);
    readJob.setMapperClass(writeMapperClass);
    readJob.setNumReduceTasks(0);
    readJob.submit();
    waitForJob(readJob);
  }
}
Example 17
Source File: TestInputOutputFormatWithPadding.java From parquet-mr with Apache License 2.0

@Test
public void testBasicBehaviorWithPadding() throws Exception {
  HadoopOutputFile.getBlockFileSystems().add("file");

  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();

  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());

  File outputFolder = temp.newFile();
  outputFolder.delete();

  Configuration conf = new Configuration();
  // May test against multiple hadoop versions
  conf.set("dfs.block.size", "1024");
  conf.set("dfs.blocksize", "1024");
  conf.set("dfs.blockSize", "1024");
  conf.set("fs.local.block.size", "1024");

  // don't use a cached FS with a different block size
  conf.set("fs.file.impl.disable.cache", "true");

  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());

  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
    ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
    ParquetOutputFormat.setBlockSize(writeJob, 1024);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);

    waitForJob(writeJob);
  }

  // make sure padding was added
  File parquetFile = getDataFile(tempFolder);
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()),
      ParquetMetadataConverter.NO_FILTER);
  for (BlockMetaData block : footer.getBlocks()) {
    Assert.assertTrue("Block should start at a multiple of the block size",
        block.getStartingPos() % 1024 == 0);
  }

  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(NoSplits.class);
    ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
    TextInputFormat.addInputPath(readJob, tempPath);

    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    readJob.setNumReduceTasks(0); // write directly to text without reduce
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

    waitForJob(readJob);
  }

  File dataFile = getDataFile(outputFolder);
  Assert.assertNotNull("Should find a data file", dataFile);

  StringBuilder contentBuilder = new StringBuilder();
  for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
    contentBuilder.append(line);
  }
  String reconstructed = contentBuilder.toString();
  Assert.assertEquals("Should match written file content", FILE_CONTENT, reconstructed);

  HadoopOutputFile.getBlockFileSystems().remove("file");
}
Example 18
Source File: TestInputFormatColumnProjection.java From parquet-mr with Apache License 2.0

@Test
public void testProjectionSize() throws Exception {
  Assume.assumeTrue( // only run this test for Hadoop 2
      org.apache.hadoop.mapreduce.JobContext.class.isInterface());

  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();

  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());

  File outputFolder = temp.newFile();
  outputFolder.delete();

  Configuration conf = new Configuration();
  // set the projection schema
  conf.set("parquet.read.schema", Types.buildMessage()
      .required(BINARY).as(UTF8).named("char")
      .named("FormatTestObject").toString());

  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());

  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

    writeJob.setOutputFormatClass(ExampleOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
    ParquetOutputFormat.setBlockSize(writeJob, 10240);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 1024);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);

    waitForJob(writeJob);
  }

  long bytesWritten = 0;
  FileSystem fs = FileSystem.getLocal(conf);
  for (FileStatus file : fs.listStatus(tempPath)) {
    bytesWritten += file.getLen();
  }

  long bytesRead;
  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(ExampleInputFormat.class);
    TextInputFormat.addInputPath(readJob, tempPath);

    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    readJob.setNumReduceTasks(0); // no reduce phase
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

    waitForJob(readJob);

    bytesRead = Reader.bytesReadCounter.getValue();
  }

  Assert.assertTrue("Should read less than 10% of the input file size",
      bytesRead < (bytesWritten / 10));
}
Example 19
Source File: WriteUsingMR.java From parquet-mr with Apache License 2.0

public Path write(Message... messages) throws Exception {
  synchronized (WriteUsingMR.class) {
    outputPath = TestUtils.someTemporaryFilePath();

    Path inputPath = TestUtils.someTemporaryFilePath();
    FileSystem fileSystem = inputPath.getFileSystem(conf);
    fileSystem.create(inputPath);

    inputMessages = Collections.unmodifiableList(Arrays.asList(messages));

    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(WritingMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(ProtoParquetOutputFormat.class);
    ProtoParquetOutputFormat.setOutputPath(job, outputPath);
    ProtoParquetOutputFormat.setProtobufClass(job, TestUtils.inferRecordsClass(messages));

    waitForJob(job);

    inputMessages = null;
    return outputPath;
  }
}
Example 20
Source File: MyWordCount.java From BigDataArchitect with Apache License 2.0

public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration(true);

  // GenericOptionsParser sets -D and similar options directly on conf for us,
  // leaving only the remaining command-line arguments.
  GenericOptionsParser parser = new GenericOptionsParser(conf, args);
  String[] othargs = parser.getRemainingArgs();

  // Let the framework know the job is submitted from a Windows (cross-platform) client.
  conf.set("mapreduce.app-submission.cross-platform", "true");
  // conf.set("mapreduce.framework.name","local");
  // System.out.println(conf.get("mapreduce.framework.name"));

  Job job = Job.getInstance(conf);
  // FileInputFormat.setMinInputSplitSize(job,2222);
  // job.setInputFormatClass(ooxx.class);

  job.setJar("C:\\Users\\admin\\IdeaProjects\\msbhadoop\\target\\hadoop-hdfs-1.0-0.1.jar");
  // Absolutely required.
  job.setJarByClass(MyWordCount.class);

  job.setJobName("mashibing");

  Path infile = new Path(othargs[0]);
  TextInputFormat.addInputPath(job, infile);

  Path outfile = new Path(othargs[1]);
  if (outfile.getFileSystem(conf).exists(outfile)) {
    outfile.getFileSystem(conf).delete(outfile, true);
  }
  TextOutputFormat.setOutputPath(job, outfile);

  job.setMapperClass(MyMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(MyReducer.class);
  // job.setNumReduceTasks(2);

  // Submit the job, then poll for progress until the job is complete
  job.waitForCompletion(true);
}