Java Code Examples for org.apache.hadoop.mapreduce.RecordReader#initialize()
The following examples show how to use
org.apache.hadoop.mapreduce.RecordReader#initialize().
You can go to the original project or source file by following the link above each example.
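Before looking at the individual examples, it may help to see the pattern they all share: obtain a RecordReader from an InputFormat via createRecordReader(), call initialize() with the split and a TaskAttemptContext before reading anything, then iterate with nextKeyValue()/getCurrentKey()/getCurrentValue() and close the reader. The following is a minimal sketch of that pattern, not taken from any of the examples below; the class name and the input path "/tmp/input" are placeholders, and TextInputFormat is used only because it ships with Hadoop.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

// Minimal sketch: read every record of every split outside of a running job.
// The input path is a placeholder and must point at existing files.
public class RecordReaderInitializeSketch {
  public static void main(String[] args) throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, "/tmp/input");

    TextInputFormat format = new TextInputFormat();
    List<InputSplit> splits = format.getSplits(job);

    for (InputSplit split : splits) {
      // A dummy task attempt context is enough when reading outside a real task.
      TaskAttemptContext context =
          new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
      RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
      // initialize() must be called before the first nextKeyValue().
      reader.initialize(split, context);
      while (reader.nextKeyValue()) {
        System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
      }
      reader.close();
    }
  }
}

The tests below follow this same sequence; most of them only differ in the InputFormat under test and in how the TaskAttemptContext (or MapContext) passed to initialize() is constructed.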
Example 1
Source File: TestCombineTextInputFormat.java From big-c with Apache License 2.0 | 6 votes |
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
    MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable,Text,LongWritable,Text> mcontext =
    new MapContextImpl<LongWritable,Text,LongWritable,Text>(conf,
    context.getTaskAttemptID(), reader, null, null,
    MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  return result;
}
Example 2
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0 | 6 votes |
@Test
public void readEthereumBlockInputFormatBlock1346406GzipCompressed()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1346406.bin.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for block 1346406");
  RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.nextKeyValue(), "Input Split for block 1346406 contains at least one block");
  key = reader.getCurrentKey();
  block = reader.getCurrentValue();
  assertEquals(6, block.getEthereumTransactions().size(), "Block 1346406 must have 6 transactions");
  assertFalse(reader.nextKeyValue(), "No further blocks in block 1346406");
  reader.close();
}
Example 3
Source File: TestCRAMInputFormatOnHDFS.java From Hadoop-BAM with MIT License | 6 votes |
@Test
public void testReader() throws Exception {
  int expectedCount = 0;
  SamReader samReader = SamReaderFactory.makeDefault()
      .referenceSequence(new File(URI.create(reference))).open(new File(input));
  for (SAMRecord r : samReader) {
    expectedCount++;
  }
  CRAMInputFormat inputFormat = new CRAMInputFormat();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);
  assertEquals(1, splits.size());
  RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat
      .createRecordReader(splits.get(0), taskAttemptContext);
  reader.initialize(splits.get(0), taskAttemptContext);
  int actualCount = 0;
  while (reader.nextKeyValue()) {
    actualCount++;
  }
  assertEquals(expectedCount, actualCount);
}
Example 4
Source File: TestCombineFileInputFormat.java From hadoop with Apache License 2.0 | 5 votes |
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1"), new Path("file2") };
  long [] lengths = { 1, 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // first initialize() call comes from MapTask. We'll do it here.
  rr.initialize(split, context);

  // First value is first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());

  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());

  // And the 2nd rr will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());

  // But after both child RR's have returned their singleton (k, v), this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
Example 5
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0 | 5 votes |
@Test
public void readEthereumBlockInputFormatBlock3510000to3510010()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth351000to3510010.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for block 3510000 .. 3510010");
  RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  int count = 0;
  while (count < 11) {
    if (reader.nextKeyValue()) {
      count++;
    }
  }
  assertEquals(11, count, "Block 3510000 .. 3510010 contains 11 blocks");
  assertFalse(reader.nextKeyValue(), "No further blocks in block 3510000 .. 3510010");
  reader.close();
}
Example 6
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Test with record length set to 0
 */
@Test (timeout=5000)
public void testZeroRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  Job job = Job.getInstance(defaultConf);
  // Set the fixed length record length config property
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 0);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context =
          MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable> mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable, BytesWritable>(
              job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for zero record length:", exceptionThrown);
}
Example 7
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Test with record length set to 0
 */
@Test (timeout=5000)
public void testZeroRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  Job job = Job.getInstance(defaultConf);
  // Set the fixed length record length config property
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 0);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context =
          MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable> mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable, BytesWritable>(
              job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for zero record length:", exceptionThrown);
}
Example 8
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0 | 5 votes |
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint()
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "us");
  conf.set("hadoopoffice.read.header.read", "true");
  conf.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  conf.set("hadoopoffice.read.header.column.names.regex", "column");
  conf.set("hadoopoffice.read.header.column.names.replace", "spalte");
  conf.set("hadoopoffice.read.lowFootprint", "true");
  conf.set("hadoopoffice.read.lowFootprint.parser", "stax");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  assertEquals("spalte1",
      ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      " header column 1 correctly read");
  assertEquals("spalte2",
      ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      " header column 2 correctly read");
  assertEquals("spalte3",
      ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      " header column 3 correctly read");
}
Example 9
Source File: TestCombineFileInputFormat.java From hadoop with Apache License 2.0 | 5 votes |
@Test
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1") };
  long [] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
      rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
      rr.getCurrentKey().toString());
}
Example 10
Source File: InputSampler.java From big-c with Apache License 2.0 | 5 votes |
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
            reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Example 11
Source File: InputSampler.java From big-c with Apache License 2.0 | 5 votes |
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
          reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Example 12
Source File: TestCombineFileInputFormat.java From big-c with Apache License 2.0 | 5 votes |
@Test
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1") };
  long [] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
      rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
      rr.getCurrentKey().toString());
}
Example 13
Source File: TestTableSnapshotInputFormat.java From hbase with Apache License 2.0 | 4 votes |
private void verifyWithMockedMapReduce(Job job, int numRegions, int expectedNumSplits,
    byte[] startRow, byte[] stopRow) throws IOException, InterruptedException {
  TableSnapshotInputFormat tsif = new TableSnapshotInputFormat();
  List<InputSplit> splits = tsif.getSplits(job);

  Assert.assertEquals(expectedNumSplits, splits.size());

  HBaseTestingUtility.SeenRowTracker rowTracker = new HBaseTestingUtility.SeenRowTracker(startRow,
      stopRow.length > 0 ? stopRow : Bytes.toBytes("\uffff"));

  boolean localityEnabled =
      job.getConfiguration().getBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_KEY,
          SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_DEFAULT);

  for (int i = 0; i < splits.size(); i++) {
    // validate input split
    InputSplit split = splits.get(i);
    Assert.assertTrue(split instanceof TableSnapshotRegionSplit);
    TableSnapshotRegionSplit snapshotRegionSplit = (TableSnapshotRegionSplit) split;
    if (localityEnabled) {
      Assert.assertTrue(split.getLocations() != null && split.getLocations().length != 0);
    } else {
      Assert.assertTrue(split.getLocations() != null && split.getLocations().length == 0);
    }

    Scan scan =
        TableMapReduceUtil.convertStringToScan(snapshotRegionSplit.getDelegate().getScan());
    if (startRow.length > 0) {
      Assert.assertTrue(
          Bytes.toStringBinary(startRow) + " should <= " + Bytes.toStringBinary(scan.getStartRow()),
          Bytes.compareTo(startRow, scan.getStartRow()) <= 0);
    }
    if (stopRow.length > 0) {
      Assert.assertTrue(
          Bytes.toStringBinary(stopRow) + " should >= " + Bytes.toStringBinary(scan.getStopRow()),
          Bytes.compareTo(stopRow, scan.getStopRow()) >= 0);
    }
    Assert.assertTrue("startRow should < stopRow",
        Bytes.compareTo(scan.getStartRow(), scan.getStopRow()) < 0);

    // validate record reader
    TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
    when(taskAttemptContext.getConfiguration()).thenReturn(job.getConfiguration());
    RecordReader<ImmutableBytesWritable, Result> rr =
        tsif.createRecordReader(split, taskAttemptContext);
    rr.initialize(split, taskAttemptContext);

    // validate we can read all the data back
    while (rr.nextKeyValue()) {
      byte[] row = rr.getCurrentKey().get();
      verifyRowFromMap(rr.getCurrentKey(), rr.getCurrentValue());
      rowTracker.addRow(row);
    }

    rr.close();
  }

  // validate all rows are seen
  rowTracker.validate();
}
Example 14
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0 | 4 votes |
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedPositiveLowFootprint()
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  conf.set("hadoopoffice.read.lowFootprint", "true");
  conf.set("hadoopoffice.read.lowFootprint.parser", "stax");
  // for decryption simply set the password
  conf.set("hadoopoffice.read.security.crypt.password", "test");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  Text spreadSheetKey = new Text();
  ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals("[excel2013encrypt.xlsx]Sheet1!A1", spreadSheetKey.toString(),
      "Input Split for Excel file has keyname == \"[excel2013encrypt.xlsx]Sheet1!A1\"");
  assertEquals(3, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 1 with 3 columns");
  assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
  assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
      "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
  assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
      "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
  assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
  assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}
Example 15
Source File: GraphSONInputFormat.java From tinkerpop with Apache License 2.0 | 4 votes |
@Override
public RecordReader<NullWritable, VertexWritable> createRecordReader(final InputSplit split,
    final TaskAttemptContext context) throws IOException, InterruptedException {
  RecordReader<NullWritable, VertexWritable> reader = new GraphSONRecordReader();
  reader.initialize(split, context);
  return reader;
}
Example 16
Source File: GFRecordReaderJUnitTest.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public void testGFRecordReaderNHop1Split() throws Exception {
  cluster = super.initMiniCluster(CLUSTER_PORT, 1);

  int entryCount = 2;
  int bucketCount = 3;
  HashSet<String> keySet = new HashSet<String>();

  for (int j = 0; j < bucketCount; j++) {
    HdfsSortedOplogOrganizer bucket = new HdfsSortedOplogOrganizer(
        regionManager, j);
    ArrayList<TestEvent> items = new ArrayList<TestEvent>();
    for (int i = 0; i < entryCount; i++) {
      String key = "key - " + j + " : " + i;
      items.add(new TestEvent(key, ("value-" + System.nanoTime())));
      keySet.add(key);
    }
    bucket.flush(items.iterator(), entryCount);
  }
  assertEquals(entryCount * bucketCount, keySet.size());

  Configuration conf = hdfsStore.getFileSystem().getConf();
  GFInputFormat gfInputFormat = new GFInputFormat();
  Job job = Job.getInstance(conf, "test");

  conf = job.getConfiguration();
  conf.set(GFInputFormat.INPUT_REGION, getName());
  conf.set(GFInputFormat.HOME_DIR, testDataDir.getName());
  conf.setBoolean(GFInputFormat.CHECKPOINT, false);

  List<InputSplit> splits = gfInputFormat.getSplits(job);
  assertEquals(1, splits.size());

  CombineFileSplit split = (CombineFileSplit) splits.get(0);
  assertEquals(bucketCount, split.getNumPaths());

  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  RecordReader<GFKey, PersistedEventImpl> reader = gfInputFormat
      .createRecordReader(split, context);
  reader.initialize(split, context);

  while (reader.nextKeyValue()) {
    keySet.remove(reader.getCurrentKey().getKey());
  }
  assertEquals(0, keySet.size());
  reader.close();
}
Example 17
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0 | 4 votes |
@Test
public void readExcelInputFormatExcel2013SingleSheetLowFootPrint()
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013test.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  // set locale to the one of the test data
  conf.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  conf.set("hadoopoffice.read.lowFootprint", "true");
  conf.set("hadoopoffice.read.lowFootprint.parser", "stax");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, file);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals(1, splits.size(), "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  Text spreadSheetKey = new Text();
  ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals("[excel2013test.xlsx]Sheet1!A1", spreadSheetKey.toString(),
      "Input Split for Excel file has keyname == \"[excel2013test.xlsx]Sheet1!A1\"");
  assertEquals(4, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 1 with 4 columns");
  assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
  assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
      "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
  assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
      "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
  assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
  assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
  assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[3]).getFormattedValue(),
      "Input Split for Excel file contains row 1 with cell 4 == \"test4\"");
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals(1, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 2 with 1 column");
  assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 2 with cell 1 == \"4\"");
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals(5, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 3 with 5 columns");
  assertEquals("31/12/99", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 3 with cell 1 == \"31/12/99\"");
  assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 3 with cell 2 == \"5\"");
  assertNull(spreadSheetValue.get()[2],
      "Input Split for Excel file contains row 3 with cell 3 == null");
  assertNull(spreadSheetValue.get()[3],
      "Input Split for Excel file contains row 3 with cell 4 == null");
  assertEquals("null", ((SpreadSheetCellDAO) spreadSheetValue.get()[4]).getFormattedValue(),
      "Input Split for Excel file contains row 3 with cell 5 == \"null\"");
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals(1, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 4 with 1 column");
  assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 4 with cell 1 == \"1\"");
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 5");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals(3, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 5 with 3 columns");
  assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 5 with cell 1 == \"2\"");
  assertEquals("6", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 5 with cell 2== \"6\"");
  assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 5 with cell 3== \"10\"");
  assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 6");
  spreadSheetKey = reader.getCurrentKey();
  spreadSheetValue = reader.getCurrentValue();
  assertEquals(3, spreadSheetValue.get().length,
      "Input Split for Excel file contains row 6 with 3 columns");
  assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
      "Input Split for Excel file contains row 6 with cell 1 == \"3\"");
  assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
      "Input Split for Excel file contains row 6 with cell 2== \"4\"");
  assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
      "Input Split for Excel file contains row 6 with cell 3== \"15\"");
}
Example 18
Source File: TestMRKeyValueTextInputFormat.java From big-c with Apache License 2.0 | 4 votes |
@Test
public void testFormat() throws Exception {
  Job job = Job.getInstance(new Configuration(defaultConf));
  Path file = new Path(workDir, "test.txt");

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int MAX_LENGTH = 10000;
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 10) + 1) {

    LOG.debug("creating; entries = " + length);

    // create a file with length entries
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i * 2));
        writer.write("\t");
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
      LOG.debug("splitting: requesting = " + numSplits);
      List<InputSplit> splits = format.getSplits(job);
      LOG.debug("splitting: got = " + splits.size());

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.size(); j++) {
        LOG.debug("split["+j+"]= " + splits.get(j));
        TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<Text, Text> reader = format.createRecordReader(
          splits.get(j), context);
        Class<?> clazz = reader.getClass();
        assertEquals("reader class is KeyValueLineRecordReader.",
          KeyValueLineRecordReader.class, clazz);
        MapContext<Text, Text, Text, Text> mcontext =
          new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), splits.get(j));
        reader.initialize(splits.get(j), mcontext);

        Text key = null;
        Text value = null;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            clazz = key.getClass();
            assertEquals("Key class is Text.", Text.class, clazz);
            value = reader.getCurrentValue();
            clazz = value.getClass();
            assertEquals("Value class is Text.", Text.class, clazz);
            final int k = Integer.parseInt(key.toString());
            final int v = Integer.parseInt(value.toString());
            assertEquals("Bad key", 0, k % 2);
            assertEquals("Mismatched key/value", k / 2, v);
            LOG.debug("read " + v);
            assertFalse("Key in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }
          LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
Example 19
Source File: TestCombineTextInputFormat.java From big-c with Apache License 2.0 | 4 votes |
@Test(timeout=10000)
public void testFormat() throws Exception {
  Job job = Job.getInstance(new Configuration(defaultConf));

  Random random = new Random();
  long seed = random.nextLong();
  LOG.info("seed = " + seed);
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with various lengths
  createFiles(length, numFiles, random);

  // create a combined split for the files
  CombineTextInputFormat format = new CombineTextInputFormat();
  for (int i = 0; i < 3; i++) {
    int numSplits = random.nextInt(length/20) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got = " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    LOG.debug("split= " + split);
    TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
    RecordReader<LongWritable, Text> reader =
      format.createRecordReader(split, context);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());
    MapContext<LongWritable,Text,LongWritable,Text> mcontext =
      new MapContextImpl<LongWritable,Text,LongWritable,Text>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);

    try {
      int count = 0;
      while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();
        assertNotNull("Key should not be null.", key);
        Text value = reader.getCurrentValue();
        final int v = Integer.parseInt(value.toString());
        LOG.debug("read " + v);
        assertFalse("Key in multiple partitions.", bits.get(v));
        bits.set(v);
        count++;
      }
      LOG.debug("split=" + split + " count=" + count);
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
Example 20
Source File: TestFastaInputFormat.java From Hadoop-BAM with MIT License | 4 votes |
@Test
public void testReader() throws Exception {
  FastaInputFormat inputFormat = new FastaInputFormat();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);
  assertEquals(2, splits.size());
  RecordReader<Text, ReferenceFragment> reader = inputFormat
      .createRecordReader(splits.get(0), taskAttemptContext);
  reader.initialize(splits.get(0), taskAttemptContext);

  assertTrue(reader.nextKeyValue());
  assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:11"),
      reader.getCurrentKey());
  assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"),
      reader.getCurrentValue().getSequence());
  assertTrue(reader.nextKeyValue());
  assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:182"),
      reader.getCurrentKey());
  assertEquals(new Text("ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC"),
      reader.getCurrentValue().getSequence());
  assertTrue(reader.nextKeyValue());
  assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1163"),
      reader.getCurrentKey());
  assertEquals(new Text("CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC"),
      reader.getCurrentValue().getSequence());
  assertTrue(reader.nextKeyValue());
  assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1244"),
      reader.getCurrentKey());
  assertEquals(new Text("TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC"),
      reader.getCurrentValue().getSequence());
  assertTrue(reader.nextKeyValue());
  assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1325"),
      reader.getCurrentKey());
  assertEquals(new Text("CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC"),
      reader.getCurrentValue().getSequence());
  assertFalse(reader.nextKeyValue());

  reader = inputFormat.createRecordReader(splits.get(1), taskAttemptContext);
  reader.initialize(splits.get(1), taskAttemptContext);
  assertTrue(reader.nextKeyValue());
  assertEquals(new Text("chr2 dna:chromosome chromosome:GRCh37:2:1:243199373:11"),
      reader.getCurrentKey());
  assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC"),
      reader.getCurrentValue().getSequence());
  assertFalse(reader.nextKeyValue());
  reader.close();
}