org.apache.hadoop.mapred.FileInputFormat Java Examples
The following examples show how to use org.apache.hadoop.mapred.FileInputFormat, the file-based InputFormat base class of the classic (mapred) MapReduce API. Its static helpers (setInputPaths, addInputPath, getInputPaths) configure and query the input paths of a JobConf.
Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
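Most examples below follow the same pattern: build a JobConf, point FileInputFormat at one or more input paths, configure formats and mapper/reducer classes, and submit the job. The following is a minimal, self-contained sketch of that pattern, not taken from any project listed here; the paths, job name, and class name are hypothetical placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class FileInputFormatSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(FileInputFormatSketch.class);
    job.setJobName("file-input-format-sketch");          // hypothetical job name

    // setInputPaths replaces any previously configured paths;
    // addInputPath appends an additional one.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));      // hypothetical path
    FileInputFormat.addInputPath(job, new Path("/tmp/more-input"));  // hypothetical path
    job.setInputFormat(TextInputFormat.class);

    // Identity map/reduce just to make the sketch runnable;
    // TextInputFormat emits <LongWritable offset, Text line> pairs.
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path("/tmp/output"));    // hypothetical path
    JobClient.runJob(job);
  }
}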
Example #1
Source File: TestFileSystem.java From hadoop-gpu with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck) throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #2
Source File: HiveAccumuloInputFormatTest.java From accumulo-hive-storage-manager with Apache License 2.0
@Test
public void getNone() {
  FileInputFormat.addInputPath(conf, new Path("unused"));
  conf.set(AccumuloSerde.COLUMN_MAPPINGS, "cf|f1");
  try {
    InputSplit[] splits = inputformat.getSplits(conf, 0);
    assertEquals(splits.length, 1);
    RecordReader<Text, AccumuloHiveRow> reader = inputformat.getRecordReader(splits[0], conf, null);
    Text rowId = new Text("r1");
    AccumuloHiveRow row = new AccumuloHiveRow();
    row.setRowId("r1");
    assertFalse(reader.next(rowId, row));
  } catch (IOException e) {
    log.error(e);
    fail();
  }
  // for whatever reason, this errors unless I manually reset. mockKeyValues() ignored @BeforeTest
  conf.set(AccumuloSerde.COLUMN_MAPPINGS, "cf|name,cf|sid,cf|dgrs,cf|mills");
}
Example #3
Source File: BaseAllocator.java From HiveKa with Apache License 2.0
@Override
public InputSplit[] allocateWork(List<CamusRequest> requests, JobConf conf) throws IOException {
  int numTasks = conf.getInt("mapred.map.tasks", 30);
  reverseSortRequests(requests);

  List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();
  Path[] tablePaths = FileInputFormat.getInputPaths(conf);

  for (int i = 0; i < numTasks; i++) {
    if (requests.size() > 0) {
      kafkaETLSplits.add(new KafkaSplit(tablePaths[0]));
    }
  }

  for (CamusRequest r : requests) {
    getSmallestMultiSplit(kafkaETLSplits).addRequest(r);
  }

  InputSplit[] inputSplits = new InputSplit[kafkaETLSplits.size()];
  return kafkaETLSplits.toArray(inputSplits);
}
Example #4
Source File: NNBench.java From RDFS with Apache License 2.0
/**
 * Run the test
 *
 * @throws IOException on error
 */
public static void runTests(Configuration config) throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);

  JobConf job = new JobConf(config, NNBench.class);
  job.setJobName("NNBench-" + operation);

  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);

  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);

  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
Example #5
Source File: FrameReaderTextCell.java From systemds with Apache License 2.0
protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
    ValueType[] schema, String[] names, long rlen, long clen)
  throws IOException
{
  if( fs.isDirectory(path) ) {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    for(InputSplit split : splits)
      readTextCellFrameFromInputSplit(split, informat, job, dest);
  }
  else {
    readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
  }
}
Example #6
Source File: Job.java From RDFS with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission
 * is successful, FAILED otherwise.
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
Example #7
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
    Class<RecordReader<K, V>> rrClass) throws IOException {
  isRealTime = Boolean.valueOf(job.get("hudi.hive.realtime", "false"));
  if (isRealTime) {
    List<RecordReader> recordReaders = new LinkedList<>();
    ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
        + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
    for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      if (split.getPaths().length == 0) {
        continue;
      }
      FileInputFormat inputFormat =
          HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
      recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
    }
    return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
  }
  return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
}
Example #8
Source File: JobControlTestUtils.java From hadoop with Apache License 2.0
/**
 * Creates a simple copy job.
 *
 * @param indirs List of input directories.
 * @param outdir Output directory.
 * @return JobConf initialised for a simple copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
static JobConf createCopyJob(List<Path> indirs, Path outdir) throws Exception {
  Configuration defaults = new Configuration();
  JobConf theJob = new JobConf(defaults, TestJobControl.class);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs.toArray(new Path[0]));
  theJob.setMapperClass(DataCopy.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopy.class);
  theJob.setNumMapTasks(12);
  theJob.setNumReduceTasks(4);
  return theJob;
}
Example #9
Source File: TestFileSystem.java From RDFS with Apache License 2.0
public static void writeTest(FileSystem fs, boolean fastCheck) throws Exception {
  fs.delete(DATA_DIR, true);
  fs.delete(WRITE_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(WriteMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, WRITE_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #10
Source File: TestFileSystem.java From RDFS with Apache License 2.0
public static void readTest(FileSystem fs, boolean fastCheck) throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(ReadMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(UTF8.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
Example #11
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 1 contains at least one block");
  assertEquals(0, block.getEthereumTransactions().size(), "Block 1 must have 0 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 1");
  reader.close();
}
Example #12
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1346406() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1346406.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 1346406 contains at least one block");
  assertEquals(6, block.getEthereumTransactions().size(), "Block 1346406 must have 6 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 1346406");
  reader.close();
}
Example #13
Source File: LinkDbMerger.java From nutch-htmlunit with Apache License 2.0
public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("LinkDb merge: starting at " + sdf.format(start));

  JobConf job = createMergeJob(getConf(), output, normalize, filter);
  for (int i = 0; i < dbs.length; i++) {
    FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
  }
  JobClient.runJob(job);
  FileSystem fs = FileSystem.get(getConf());
  fs.mkdirs(output);
  fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME));

  long end = System.currentTimeMillis();
  LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
Example #14
Source File: PersonVersion.java From blog with MIT License
private static void runJobPv(String inputDir, String outputDir, String jobName,
    Class<? extends Mapper> mapClass, Class<? extends Reducer> reduceClass) throws Exception {
  JobConf conf = new JobConf(PersonVersion.class);
  conf.setJobName(jobName);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(mapClass);
  conf.setCombinerClass(reduceClass);
  conf.setReducerClass(reduceClass);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, inputDir);
  FileOutputFormat.setOutputPath(conf, new Path(outputDir));
  JobClient.runJob(conf);
}
Example #15
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedPositiveLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  Text spreadSheetKey = new Text();
  ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
  assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
  assertEquals("[excel2013encrypt.xlsx]Sheet1!A1", spreadSheetKey.toString(),
      "Input Split for Excel file has keyname == \"[excel2013encrypt.xlsx]Sheet1!A1\"");
  assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
  assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
  assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
      "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
  assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
      "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
  assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
  assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}
Example #16
Source File: ReaderTextCSVParallel.java From systemds with Apache License 2.0
private void readCSVMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest,
    long rlen, long clen, int blen, boolean hasHeader, String delim, boolean fill, double fillValue)
  throws IOException
{
  FileInputFormat.addInputPath(job, path);
  TextInputFormat informat = new TextInputFormat();
  informat.configure(job);

  ExecutorService pool = CommonThreadPool.get(_numThreads);

  try {
    // create read tasks for all splits
    ArrayList<CSVReadTask> tasks = new ArrayList<>();
    int splitCount = 0;
    for (InputSplit split : splits) {
      tasks.add(new CSVReadTask(split, _offsets, informat, job, dest,
        rlen, clen, hasHeader, delim, fill, fillValue, splitCount++));
    }
    pool.invokeAll(tasks);
    pool.shutdown();

    // check return codes and aggregate nnz
    long lnnz = 0;
    for (CSVReadTask rt : tasks) {
      lnnz += rt.getPartialNnz();
      if (!rt.getReturnCode()) {
        Exception err = rt.getException();
        throw new IOException("Read task for csv input failed: " + err.toString(), err);
      }
    }
    dest.setNonZeros(lnnz);
  }
  catch (Exception e) {
    throw new IOException("Threadpool issue, while parallel read.", e);
  }
}
Example #17
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock403419() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "block403419.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for block 403419");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 403419 contains at least one block");
  assertEquals(2, block.getEthereumTransactions().size(), "Block 403419 must have 2 transactions");
  EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
  assertEquals("f8b483dba2c3b7176a3da549ad41a48bb3121069",
      bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
      "Block 403419 was mined by f8b483dba2c3b7176a3da549ad41a48bb3121069");
  assertEquals("08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322",
      bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
      "The parent of block 403419 has hash 08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322");
  assertFalse(reader.next(key, block), "No further lock 403419 in genesis Block");
  reader.close();
}
Example #18
Source File: TestCopyOnWriteActionExecutor.java From hudi with Apache License 2.0
private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
    throws Exception {
  // initialize parquet input format
  HoodieParquetInputFormat hoodieInputFormat = new HoodieParquetInputFormat();
  JobConf jobConf = new JobConf(hadoopConf);
  hoodieInputFormat.setConf(jobConf);
  HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
  setupIncremental(jobConf, startCommitTime, numCommitsToPull);
  FileInputFormat.setInputPaths(jobConf, Paths.get(basePath, partitionPath).toString());
  return hoodieInputFormat.listStatus(jobConf);
}
Example #19
Source File: MapUtils.java From tez with Apache License 2.0
private static InputSplit createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file, int numKVs)
    throws IOException {
  FileInputFormat.setInputPaths(job, workDir);

  LOG.info("Generating data at path: " + file);
  // create a file with length entries
  @SuppressWarnings("deprecation")
  SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
  try {
    Random r = new Random(System.currentTimeMillis());
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = numKVs; i > 0; i--) {
      key.set(r.nextInt(1000));
      value.set(Integer.toString(i));
      writer.append(key, value);
      LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
    }
  } finally {
    writer.close();
  }

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(job, 1);
  System.err.println("#split = " + splits.length + " ; " +
      "#locs = " + splits[0].getLocations().length + "; " +
      "loc = " + splits[0].getLocations()[0] + "; " +
      "off = " + splits[0].getLength() + "; " +
      "file = " + ((FileSplit) splits[0]).getPath());
  return splits[0];
}
Example #20
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedPositiveLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2003encrypt.xls";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  Text spreadSheetKey = new Text();
  ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
  assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
  assertEquals("[excel2003encrypt.xls]Sheet1!A1", spreadSheetKey.toString(),
      "Input Split for Excel file has keyname == \"[excel2003encrypt.xls]Sheet1!A1\"");
  assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
  assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
  assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
      "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
  assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
      "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
  assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
  assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}
Example #21
Source File: TestHoodieParquetInputFormat.java From hudi with Apache License 2.0
@Test
public void testInputFormatLoad() throws IOException {
  // initial commit
  File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
  InputFormatTestUtil.commit(basePath, "100");

  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
  assertEquals(10, inputSplits.length);

  FileStatus[] files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length);
}
Example #22
Source File: TestHoodieParquetInputFormat.java From hudi with Apache License 2.0
@Test
public void testIncrementalSimple() throws IOException {
  // initial commit
  File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
  createCommitFile(basePath, "100", "2016/05/01");

  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

  FileStatus[] files = inputFormat.listStatus(jobConf);
  assertEquals(0, files.length,
      "We should exclude commit 100 when returning incremental pull with start commit time as 100");
}
Example #23
Source File: LinkRank.java From anthelion with Apache License 2.0
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {
  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
Example #24
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock0to10() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth0to10.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  int count = 0;
  while (count < 11) {
    if (reader.next(key, block)) {
      count++;
    }
  }
  assertEquals(11, count, "Block 0..10 contains 11 blocks");
  assertFalse(reader.next(key, block), "No further blocks in block 0..10");
  reader.close();
}
Example #25
Source File: FrameReaderTextCSV.java From systemds with Apache License 2.0
@Override
public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names,
    long rlen, long clen)
  throws IOException, DMLRuntimeException
{
  // prepare file access
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  Path path = new Path(fname);
  FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
  FileInputFormat.addInputPath(job, path);

  // check existence and non-empty file
  checkValidInputFile(fs, path);

  // compute size if necessary
  if( rlen <= 0 || clen <= 0 ) {
    Pair<Integer,Integer> size = computeCSVSize(path, job, fs);
    rlen = size.getKey();
    clen = size.getValue();
  }

  // allocate output frame block
  ValueType[] lschema = createOutputSchema(schema, clen);
  String[] lnames = createOutputNames(names, clen);
  FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);

  // core read (sequential/parallel)
  readCSVFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen);

  return ret;
}
Example #26
Source File: TestMRAppWithCombiner.java From big-c with Apache License 2.0
@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  int numMaps = 5;
  int numReds = 2;
  Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);
  conf.setJobName("test-job-with-combiner");
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);
  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  runJob(conf);
}
Example #27
Source File: TestInputOutputFormat.java From hive-dwrf with Apache License 2.0
@Test
public void testEmptyFile() throws Exception {
  JobConf job = new JobConf(conf);
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
  writer.close(true);
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  SerDe serde = new OrcSerde();
  serde.initialize(conf, properties);
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // read the whole file
  conf.set("hive.io.file.readcolumn.ids", "0,1");
  org.apache.hadoop.mapred.RecordReader reader =
      in.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  assertEquals(0.0, reader.getProgress(), 0.00001);
  assertEquals(0, reader.getPos());
  assertEquals(false, reader.next(key, value));
  reader.close();
  assertEquals(null, serde.getSerDeStats());
}
Example #28
Source File: LinkRank.java From nutch-htmlunit with Apache License 2.0
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 *
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 *
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 *
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {
  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
Example #29
Source File: EmoInputFormat.java From emodb with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  Path[] paths = FileInputFormat.getInputPaths(job);

  return FluentIterable.from(BaseInputFormat.getSplits(job, paths))
      .transform(_fromSplit)
      .toArray(InputSplit.class);
}
Example #30
Source File: AvroUtils.java From ml-ease with Apache License 2.0
/**
 * Obtain the avro input schema from data.
 *
 * @param conf the job configuration, with input paths already set
 * @return the Avro schema read from the first input path
 * @throws IOException
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException {
  Path[] paths = FileInputFormat.getInputPaths(conf);
  if (paths == null) {
    throw new IllegalStateException("input paths do not exist in jobConf!");
  }
  Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  return inputSchema;
}