Java Code Examples for org.apache.hadoop.mapred.RecordReader#createKey()
The following examples show how to use
org.apache.hadoop.mapred.RecordReader#createKey() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestHoodieParquetInputFormat.java From hudi with Apache License 2.0 | 6 votes |
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit, int totalExpected) throws IOException { int actualCount = 0; int totalCount = 0; InputSplit[] splits = inputFormat.getSplits(jobConf, 1); for (InputSplit split : splits) { RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf, null); NullWritable key = recordReader.createKey(); ArrayWritable writable = recordReader.createValue(); while (recordReader.next(key, writable)) { // writable returns an array with [field1, field2, _hoodie_commit_time, // _hoodie_commit_seqno] // Take the commit time and compare with the one we are interested in if (commit.equals((writable.get()[2]).toString())) { actualCount++; } totalCount++; } } assertEquals(expectedNumberOfRecordsInCommit, actualCount, msg); assertEquals(totalExpected, totalCount, msg); }
Example 2
Source File: InputSampler.java From hadoop with Apache License 2.0 | 6 votes |
/** * For each split sampled, emit when the ratio of the number of records * retained to the total record count is less than the specified * frequency. */ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException { InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks()); ArrayList<K> samples = new ArrayList<K>(); int splitsToSample = Math.min(maxSplitsSampled, splits.length); int splitStep = splits.length / splitsToSample; long records = 0; long kept = 0; for (int i = 0; i < splitsToSample; ++i) { RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep], job, Reporter.NULL); K key = reader.createKey(); V value = reader.createValue(); while (reader.next(key, value)) { ++records; if ((double) kept / records < freq) { ++kept; samples.add(key); key = reader.createKey(); } } reader.close(); } return (K[])samples.toArray(); }
Example 3
Source File: InputSampler.java From big-c with Apache License 2.0 | 6 votes |
/** * For each split sampled, emit when the ratio of the number of records * retained to the total record count is less than the specified * frequency. */ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException { InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks()); ArrayList<K> samples = new ArrayList<K>(); int splitsToSample = Math.min(maxSplitsSampled, splits.length); int splitStep = splits.length / splitsToSample; long records = 0; long kept = 0; for (int i = 0; i < splitsToSample; ++i) { RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep], job, Reporter.NULL); K key = reader.createKey(); V value = reader.createValue(); while (reader.next(key, value)) { ++records; if ((double) kept / records < freq) { ++kept; samples.add(key); key = reader.createKey(); } } reader.close(); } return (K[])samples.toArray(); }
Example 4
Source File: InputSampler.java From big-c with Apache License 2.0 | 6 votes |
/** * From each split sampled, take the first numSamples / numSplits records. */ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException { InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks()); ArrayList<K> samples = new ArrayList<K>(numSamples); int splitsToSample = Math.min(maxSplitsSampled, splits.length); int splitStep = splits.length / splitsToSample; int samplesPerSplit = numSamples / splitsToSample; long records = 0; for (int i = 0; i < splitsToSample; ++i) { RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep], job, Reporter.NULL); K key = reader.createKey(); V value = reader.createValue(); while (reader.next(key, value)) { samples.add(key); key = reader.createKey(); ++records; if ((i+1) * samplesPerSplit <= records) { break; } } reader.close(); } return (K[])samples.toArray(); }
Example 5
Source File: MneMapredLongDataTest.java From mnemonic with Apache License 2.0 | 5 votes |
@Test(enabled = true, dependsOnMethods = {"testWriteLongData"}) public void testReadLongData() throws Exception { long sum = 0L; long reccnt = 0L; File folder = new File(m_workdir.toString()); File[] listfiles = folder.listFiles(); for (int idx = 0; idx < listfiles.length; ++idx) { if (listfiles[idx].isFile() && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null)) && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) { System.out.println(String.format("Verifying : %s", listfiles[idx].getName())); FileSplit split = new FileSplit( new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]); InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat = new MneInputFormat<MneDurableInputValue<Long>, Long>(); RecordReader<NullWritable, MneDurableInputValue<Long>> reader = inputFormat.getRecordReader(split, m_conf, null); MneDurableInputValue<Long> mdval = null; NullWritable mdkey = reader.createKey(); while (true) { mdval = reader.createValue(); if (reader.next(mdkey, mdval)) { sum += mdval.getValue(); ++reccnt; } else { break; } } reader.close(); } } AssertJUnit.assertEquals(m_sum, sum); AssertJUnit.assertEquals(m_reccnt, reccnt); System.out.println(String.format("The checksum of long data is %d", sum)); }
Example 6
Source File: DumpTypedBytes.java From big-c with Apache License 2.0 | 5 votes |
/** * Dump given list of files to standard output as typed bytes. */ @SuppressWarnings("unchecked") private int dumpTypedBytes(List<FileStatus> files) throws IOException { JobConf job = new JobConf(getConf()); DataOutputStream dout = new DataOutputStream(System.out); AutoInputFormat autoInputFormat = new AutoInputFormat(); for (FileStatus fileStatus : files) { FileSplit split = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen() * fileStatus.getBlockSize(), (String[]) null); RecordReader recReader = null; try { recReader = autoInputFormat.getRecordReader(split, job, Reporter.NULL); Object key = recReader.createKey(); Object value = recReader.createValue(); while (recReader.next(key, value)) { if (key instanceof Writable) { TypedBytesWritableOutput.get(dout).write((Writable) key); } else { TypedBytesOutput.get(dout).write(key); } if (value instanceof Writable) { TypedBytesWritableOutput.get(dout).write((Writable) value); } else { TypedBytesOutput.get(dout).write(value); } } } finally { if (recReader != null) { recReader.close(); } } } dout.flush(); return 0; }
Example 7
Source File: DumpTypedBytes.java From hadoop with Apache License 2.0 | 5 votes |
/** * Dump given list of files to standard output as typed bytes. */ @SuppressWarnings("unchecked") private int dumpTypedBytes(List<FileStatus> files) throws IOException { JobConf job = new JobConf(getConf()); DataOutputStream dout = new DataOutputStream(System.out); AutoInputFormat autoInputFormat = new AutoInputFormat(); for (FileStatus fileStatus : files) { FileSplit split = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen() * fileStatus.getBlockSize(), (String[]) null); RecordReader recReader = null; try { recReader = autoInputFormat.getRecordReader(split, job, Reporter.NULL); Object key = recReader.createKey(); Object value = recReader.createValue(); while (recReader.next(key, value)) { if (key instanceof Writable) { TypedBytesWritableOutput.get(dout).write((Writable) key); } else { TypedBytesOutput.get(dout).write(key); } if (value instanceof Writable) { TypedBytesWritableOutput.get(dout).write((Writable) value); } else { TypedBytesOutput.get(dout).write(value); } } } finally { if (recReader != null) { recReader.close(); } } } dout.flush(); return 0; }
Example 8
Source File: MneMapredPersonDataTest.java From mnemonic with Apache License 2.0 | 5 votes |
@Test(enabled = true, dependsOnMethods = {"testWritePersonData"}) public void testReadPersonData() throws Exception { long sumage = 0L; long reccnt = 0L; File folder = new File(m_workdir.toString()); File[] listfiles = folder.listFiles(); for (int idx = 0; idx < listfiles.length; ++idx) { if (listfiles[idx].isFile() && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null)) && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) { System.out.println(String.format("Verifying : %s", listfiles[idx].getName())); FileSplit split = new FileSplit( new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]); InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat = new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>(); RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader = inputFormat.getRecordReader(split, m_conf, null); MneDurableInputValue<Person<Long>> personval = null; NullWritable personkey = reader.createKey(); while (true) { personval = reader.createValue(); if (reader.next(personkey, personval)) { AssertJUnit.assertTrue(personval.getValue().getAge() < 51); sumage += personval.getValue().getAge(); ++reccnt; } else { break; } } reader.close(); } } AssertJUnit.assertEquals(m_reccnt, reccnt); AssertJUnit.assertEquals(m_sumage, sumage); System.out.println(String.format("The checksum of ages is %d", sumage)); }
Example 9
Source File: EventInputFormatTest.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
private void doTestRowSerDe(boolean concurrencyChecks) throws Exception { getConnection(); Connection conn = startNetserverAndGetLocalNetConnection(); final long statTS = System.currentTimeMillis(); Statement st = conn.createStatement(); st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 5000 milliseconds"); String concurrency = "persistent ENABLE CONCURRENCY CHECKS"; st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) partition by primary key buckets 1 hdfsstore (myhdfs) " +(concurrencyChecks ? concurrency : "")); PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)"); ps.setInt(1, 1); ps.setString(2, "Value-1"); ps.execute(); //Wait for data to get to HDFS... String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1"); st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)"); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/")); assertEquals(1, list.length); conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1"); conf.set(RowInputFormat.HOME_DIR, HDFS_DIR); JobConf job = new JobConf(conf); job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); RowInputFormat ipformat = new RowInputFormat(); InputSplit[] splits = ipformat.getSplits(job, 2); assertEquals(1, splits.length); RecordReader<Key, Row> rr = ipformat.getRecordReader(splits[0], job, null); Key key = rr.createKey(); Row value = rr.createValue(); assertTrue(rr.next(key, value)); assertEquals(1, value.getRowAsResultSet().getInt(1)); assertEquals("Value-1", value.getRowAsResultSet().getString(2)); assertTrue(value.getTimestamp() > statTS); assertFalse(value.getRowAsResultSet().next()); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); value.write(dos); dos.close(); byte[] buf = baos.toByteArray(); DataInputStream dis = new DataInputStream(new ByteArrayInputStream(buf)); Row row = new Row(); row.readFields(dis); dis.close(); assertEquals(1, row.getRowAsResultSet().getInt(1)); assertEquals("Value-1", row.getRowAsResultSet().getString(2)); assertFalse(value.getRowAsResultSet().next()); TestUtil.shutDown(); }
Example 10
Source File: MneMapredChunkDataTest.java From mnemonic with Apache License 2.0 | 4 votes |
@Test(enabled = true, dependsOnMethods = {"testWriteChunkData"}) public void testReadChunkData() throws Exception { List<String> partfns = new ArrayList<String>(); long reccnt = 0L; long tsize = 0L; Checksum cs = new CRC32(); cs.reset(); File folder = new File(m_workdir.toString()); File[] listfiles = folder.listFiles(); for (int idx = 0; idx < listfiles.length; ++idx) { if (listfiles[idx].isFile() && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null)) && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) { partfns.add(listfiles[idx].getName()); } } Collections.sort(partfns); // keep the order for checksum for (int idx = 0; idx < partfns.size(); ++idx) { System.out.println(String.format("Verifying : %s", partfns.get(idx))); FileSplit split = new FileSplit( new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]); InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat = new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>(); RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader = inputFormat.getRecordReader(split, m_conf, null); MneDurableInputValue<DurableChunk<?>> dchkval = null; NullWritable dchkkey = reader.createKey(); while (true) { dchkval = reader.createValue(); if (reader.next(dchkkey, dchkval)) { byte b; for (int j = 0; j < dchkval.getValue().getSize(); ++j) { b = unsafe.getByte(dchkval.getValue().get() + j); cs.update(b); } tsize += dchkval.getValue().getSize(); ++reccnt; } else { break; } } reader.close(); } AssertJUnit.assertEquals(m_reccnt, reccnt); AssertJUnit.assertEquals(m_totalsize, tsize); AssertJUnit.assertEquals(m_checksum, cs.getValue()); System.out.println(String.format("The checksum of chunk is %d", m_checksum)); }
Example 11
Source File: MultithreadedMapRunner.java From big-c with Apache License 2.0 | 4 votes |
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter) throws IOException { try { // allocate key & value instances these objects will not be reused // because execution of Mapper.map is not serialized. K1 key = input.createKey(); V1 value = input.createValue(); while (input.next(key, value)) { executorService.execute(new MapperInvokeRunable(key, value, output, reporter)); checkForExceptionsFromProcessingThreads(); // Allocate new key & value instances as mapper is running in parallel key = input.createKey(); value = input.createValue(); } if (LOG.isDebugEnabled()) { LOG.debug("Finished dispatching all Mappper.map calls, job " + job.getJobName()); } // Graceful shutdown of the Threadpool, it will let all scheduled // Runnables to end. executorService.shutdown(); try { // Now waiting for all Runnables to end. while (!executorService.awaitTermination(100, TimeUnit.MILLISECONDS)) { if (LOG.isDebugEnabled()) { LOG.debug("Awaiting all running Mappper.map calls to finish, job " + job.getJobName()); } // NOTE: while Mapper.map dispatching has concluded there are still // map calls in progress and exceptions would be thrown. checkForExceptionsFromProcessingThreads(); } // NOTE: it could be that a map call has had an exception after the // call for awaitTermination() returing true. And edge case but it // could happen. checkForExceptionsFromProcessingThreads(); } catch (IOException ioEx) { // Forcing a shutdown of all thread of the threadpool and rethrowing // the IOException executorService.shutdownNow(); throw ioEx; } catch (InterruptedException iEx) { throw new RuntimeException(iEx); } } finally { mapper.close(); } }
Example 12
Source File: PipesMapRunner.java From big-c with Apache License 2.0 | 4 votes |
/** * Run the map task. * @param input the set of inputs * @param output the object to collect the outputs of the map * @param reporter the object to update with status */ @SuppressWarnings("unchecked") public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter) throws IOException { Application<K1, V1, K2, V2> application = null; try { RecordReader<FloatWritable, NullWritable> fakeInput = (!Submitter.getIsJavaRecordReader(job) && !Submitter.getIsJavaMapper(job)) ? (RecordReader<FloatWritable, NullWritable>) input : null; application = new Application<K1, V1, K2, V2>(job, fakeInput, output, reporter, (Class<? extends K2>) job.getOutputKeyClass(), (Class<? extends V2>) job.getOutputValueClass()); } catch (InterruptedException ie) { throw new RuntimeException("interrupted", ie); } DownwardProtocol<K1, V1> downlink = application.getDownlink(); boolean isJavaInput = Submitter.getIsJavaRecordReader(job); downlink.runMap(reporter.getInputSplit(), job.getNumReduceTasks(), isJavaInput); boolean skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false); try { if (isJavaInput) { // allocate key & value instances that are re-used for all entries K1 key = input.createKey(); V1 value = input.createValue(); downlink.setInputTypes(key.getClass().getName(), value.getClass().getName()); while (input.next(key, value)) { // map pair to output downlink.mapItem(key, value); if(skipping) { //flush the streams on every record input if running in skip mode //so that we don't buffer other records surrounding a bad record. downlink.flush(); } } downlink.endOfInput(); } application.waitForFinish(); } catch (Throwable t) { application.abort(t); } finally { application.cleanup(); } }
Example 13
Source File: EventInputFormatTest.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
private void deleteTest( boolean writeOnly, boolean primaryKey, boolean isTransactional) throws Exception { //getConnection(); Connection conn = null; if (isTransactional) { conn = getTxConnection();//startNetserverAndGetLocalNetConnection(); } else { conn = getConnection(); } Statement st = conn.createStatement(); st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 2000 milliseconds"); String primaryKeyString = primaryKey ? "primary key" : ""; st.execute("create table app.mytab1 (col1 int " + primaryKeyString + ", col2 varchar(100)) BUCKETS 1 persistent hdfsstore (myhdfs) " + (writeOnly? " WRITEONLY " : "")); PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)"); for(int i = 0; i < 3; i++) { ps.setInt(1, i); ps.setString(2, "Value-" + System.nanoTime()); ps.execute(); } st.execute("delete from mytab1 where col1 = 1"); //Wait for data to get to HDFS... String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1"); st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)"); TestUtil.shutDown(); FileStatus[] list = null; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); for (int i = 0; i < 20; i++) { list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/")); if (list.length == 1) { break; } Thread.sleep(500); } if (list.length != 1) { fail("unexpected files: " + java.util.Arrays.toString(list)); } conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1"); conf.set(RowInputFormat.HOME_DIR, HDFS_DIR); JobConf job = new JobConf(conf); job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); RowInputFormat ipformat = new RowInputFormat(); InputSplit[] splits = ipformat.getSplits(job, 2); RecordReader<Key, Row> rr = ipformat.getRecordReader(splits[0], job, null); Key key = rr.createKey(); Row value = rr.createValue(); rr.next(key, value); assertEquals(0, value.getRowAsResultSet().getInt("col1")); if (!writeOnly) { rr.next(key, value); checkForDeletedRow(value, primaryKey); } rr.next(key, value); assertEquals(1, value.getRowAsResultSet().getInt("col1")); rr.next(key, value); assertEquals(2, value.getRowAsResultSet().getInt("col1")); if (writeOnly) { rr.next(key, value); checkForDeletedRow(value, primaryKey); } assertFalse(rr.next(key, value)); TestUtil.shutDown(); }
Example 14
Source File: GenericHiveRecordCursor.java From presto with Apache License 2.0 | 4 votes |
public GenericHiveRecordCursor( Configuration configuration, Path path, RecordReader<K, V> recordReader, long totalBytes, Properties splitSchema, List<HiveColumnHandle> columns, DateTimeZone hiveStorageTimeZone) { requireNonNull(path, "path is null"); requireNonNull(recordReader, "recordReader is null"); checkArgument(totalBytes >= 0, "totalBytes is negative"); requireNonNull(splitSchema, "splitSchema is null"); requireNonNull(columns, "columns is null"); requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"); this.path = path; this.recordReader = recordReader; this.totalBytes = totalBytes; this.key = recordReader.createKey(); this.value = recordReader.createValue(); this.hiveStorageTimeZone = hiveStorageTimeZone; this.deserializer = getDeserializer(configuration, splitSchema); this.rowInspector = getTableObjectInspector(deserializer); int size = columns.size(); this.types = new Type[size]; this.hiveTypes = new HiveType[size]; this.structFields = new StructField[size]; this.fieldInspectors = new ObjectInspector[size]; this.loaded = new boolean[size]; this.booleans = new boolean[size]; this.longs = new long[size]; this.doubles = new double[size]; this.slices = new Slice[size]; this.objects = new Object[size]; this.nulls = new boolean[size]; // initialize data columns for (int i = 0; i < columns.size(); i++) { HiveColumnHandle column = columns.get(i); checkState(column.getColumnType() == REGULAR, "column type must be regular"); types[i] = column.getType(); hiveTypes[i] = column.getHiveType(); StructField field = rowInspector.getStructFieldRef(column.getName()); structFields[i] = field; fieldInspectors[i] = field.getFieldObjectInspector(); } }
Example 15
Source File: EventInputFormatTest.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public void testNoSecureHdfsCheck() throws Exception { getConnection(); Connection conn = startNetserverAndGetLocalNetConnection(); Statement st = conn.createStatement(); st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 5000 milliseconds"); st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) persistent hdfsstore (myhdfs) BUCKETS 1"); PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)"); int NUM_ENTRIES = 20; for(int i = 0; i < NUM_ENTRIES; i++) { ps.setInt(1, i); ps.setString(2, "Value-" + System.nanoTime()); ps.execute(); } //Wait for data to get to HDFS... String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1"); st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)"); stopNetServer(); FabricServiceManager.currentFabricServiceInstance().stop(new Properties()); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/")); assertEquals(1, list.length); conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1"); conf.set(RowInputFormat.HOME_DIR, HDFS_DIR); conf.set("hadoop.security.authentication", "kerberos"); JobConf job = new JobConf(conf); job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); RowInputFormat ipformat = new RowInputFormat(); InputSplit[] splits = ipformat.getSplits(job, 2); assertEquals(1, splits.length); CombineFileSplit split = (CombineFileSplit) splits[0]; assertEquals(1, split.getPaths().length); assertEquals(list[0].getPath().toString(), split.getPath(0).toString()); assertEquals(0, split.getOffset(0)); assertEquals(list[0].getLen(), split.getLength(0)); RecordReader<Key, Row> rr = ipformat.getRecordReader(split, job, null); Key key = rr.createKey(); Row value = rr.createValue(); int count = 0; while (rr.next(key, value)) { assertEquals(count++, value.getRowAsResultSet().getInt("col1")); } assertEquals(20, count); TestUtil.shutDown(); }
Example 16
Source File: TestHoodieCombineHiveInputFormat.java From hudi with Apache License 2.0 | 4 votes |
@Test @Disabled public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception { Configuration conf = new Configuration(); // initial commit Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema()); HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ); String commitTime = "100"; final int numRecords = 1000; // Create 3 parquet files with 1000 records each File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime); InputFormatTestUtil.commit(tempDir, commitTime); // insert 1000 update records to log file 0 String newCommitTime = "101"; HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime, numRecords, numRecords, 0); writer.close(); // insert 1000 update records to log file 1 writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime, numRecords, numRecords, 0); writer.close(); // insert 1000 update records to log file 2 writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, numRecords, numRecords, 0); writer.close(); TableDesc tblDesc = Utilities.defaultTd; // Set the input format tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class); PartitionDesc partDesc = new PartitionDesc(tblDesc, null); LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>(); pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc); MapredWork mrwork = new MapredWork(); mrwork.getMapWork().setPathToPartitionInfo(pt); Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString()); Utilities.setMapRedWork(conf, mrwork, mapWorkPath); jobConf = new JobConf(conf); // Add the paths FileInputFormat.setInputPaths(jobConf, partitionDir.getPath()); jobConf.set(HAS_MAP_WORK, "true"); // The following config tells Hive to choose ExecMapper to read the MAP_WORK jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName()); // setting the split size to be 3 to create one split for 3 file groups jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3"); HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat(); String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double"; InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes); InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1); // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups assertEquals(1, splits.length); RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null); NullWritable nullWritable = recordReader.createKey(); ArrayWritable arrayWritable = recordReader.createValue(); int counter = 0; while (recordReader.next(nullWritable, arrayWritable)) { // read over all the splits counter++; } // should read out 3 splits, each for file0, file1, file2 containing 1000 records each assertEquals(3000, counter); }
Example 17
Source File: GFXDHiveInputFormatTest.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public void testHiveInputFormat() throws Exception { getConnection(); Connection conn = startNetserverAndGetLocalNetConnection(); Statement st = conn.createStatement(); st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 5000 milliseconds"); st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) persistent hdfsstore (myhdfs) BUCKETS 1"); PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)"); int NUM_ENTRIES = 20; for(int i = 0; i < NUM_ENTRIES; i++) { ps.setInt(1, i); ps.setString(2, "Value-" + System.nanoTime()); ps.execute(); } //Wait for data to get to HDFS... String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1"); st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)"); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/")); assertEquals(1, list.length); conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1"); conf.set(RowInputFormat.HOME_DIR, HDFS_DIR); JobConf job = new JobConf(conf); job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); GFXDHiveInputFormat ipformat = new GFXDHiveInputFormat(); InputSplit[] splits = ipformat.getSplits(job, 2); assertEquals(1, splits.length); GFXDHiveSplit split = (GFXDHiveSplit) splits[0]; assertEquals(1, split.getPaths().length); assertEquals(list[0].getPath().toString(), split.getPath(0).toString()); assertEquals(0, split.getOffset(0)); assertEquals(list[0].getLen(), split.getLength(0)); RecordReader<Key, Row> rr = ipformat.getRecordReader(split, job, null); assertTrue("Row record reader should be an instace of GFXDHiveRowRecordReader " + "but it is an instance of " + rr.getClass(), (rr instanceof GFXDHiveRowRecordReader)); Key key = rr.createKey(); Row value = rr.createValue(); int count = 0; while (rr.next(key, value)) { assertEquals(count++, value.getRowAsResultSet().getInt("col1")); } assertEquals(20, count); TestUtil.shutDown(); }
Example 18
Source File: InputSampler.java From hadoop with Apache License 2.0 | 4 votes |
/** * Randomize the split order, then take the specified number of keys from * each split sampled, where each key is selected with the specified * probability and possibly replaced by a subsequently selected key when * the quota of keys from that split is satisfied. */ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException { InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks()); ArrayList<K> samples = new ArrayList<K>(numSamples); int splitsToSample = Math.min(maxSplitsSampled, splits.length); Random r = new Random(); long seed = r.nextLong(); r.setSeed(seed); LOG.debug("seed: " + seed); // shuffle splits for (int i = 0; i < splits.length; ++i) { InputSplit tmp = splits[i]; int j = r.nextInt(splits.length); splits[i] = splits[j]; splits[j] = tmp; } // our target rate is in terms of the maximum number of sample splits, // but we accept the possibility of sampling additional splits to hit // the target sample keyset for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); ++i) { RecordReader<K,V> reader = inf.getRecordReader(splits[i], job, Reporter.NULL); K key = reader.createKey(); V value = reader.createValue(); while (reader.next(key, value)) { if (r.nextDouble() <= freq) { if (samples.size() < numSamples) { samples.add(key); } else { // When exceeding the maximum number of samples, replace a // random element with this one, then adjust the frequency // to reflect the possibility of existing elements being // pushed out int ind = r.nextInt(numSamples); if (ind != numSamples) { samples.set(ind, key); } freq *= (numSamples - 1) / (double) numSamples; } key = reader.createKey(); } } reader.close(); } return (K[])samples.toArray(); }
Example 19
Source File: MultithreadedMapRunner.java From hadoop with Apache License 2.0 | 4 votes |
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter) throws IOException { try { // allocate key & value instances these objects will not be reused // because execution of Mapper.map is not serialized. K1 key = input.createKey(); V1 value = input.createValue(); while (input.next(key, value)) { executorService.execute(new MapperInvokeRunable(key, value, output, reporter)); checkForExceptionsFromProcessingThreads(); // Allocate new key & value instances as mapper is running in parallel key = input.createKey(); value = input.createValue(); } if (LOG.isDebugEnabled()) { LOG.debug("Finished dispatching all Mappper.map calls, job " + job.getJobName()); } // Graceful shutdown of the Threadpool, it will let all scheduled // Runnables to end. executorService.shutdown(); try { // Now waiting for all Runnables to end. while (!executorService.awaitTermination(100, TimeUnit.MILLISECONDS)) { if (LOG.isDebugEnabled()) { LOG.debug("Awaiting all running Mappper.map calls to finish, job " + job.getJobName()); } // NOTE: while Mapper.map dispatching has concluded there are still // map calls in progress and exceptions would be thrown. checkForExceptionsFromProcessingThreads(); } // NOTE: it could be that a map call has had an exception after the // call for awaitTermination() returing true. And edge case but it // could happen. checkForExceptionsFromProcessingThreads(); } catch (IOException ioEx) { // Forcing a shutdown of all thread of the threadpool and rethrowing // the IOException executorService.shutdownNow(); throw ioEx; } catch (InterruptedException iEx) { throw new RuntimeException(iEx); } } finally { mapper.close(); } }
Example 20
Source File: PipesMapRunner.java From hadoop with Apache License 2.0 | 4 votes |
/** * Run the map task. * @param input the set of inputs * @param output the object to collect the outputs of the map * @param reporter the object to update with status */ @SuppressWarnings("unchecked") public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter) throws IOException { Application<K1, V1, K2, V2> application = null; try { RecordReader<FloatWritable, NullWritable> fakeInput = (!Submitter.getIsJavaRecordReader(job) && !Submitter.getIsJavaMapper(job)) ? (RecordReader<FloatWritable, NullWritable>) input : null; application = new Application<K1, V1, K2, V2>(job, fakeInput, output, reporter, (Class<? extends K2>) job.getOutputKeyClass(), (Class<? extends V2>) job.getOutputValueClass()); } catch (InterruptedException ie) { throw new RuntimeException("interrupted", ie); } DownwardProtocol<K1, V1> downlink = application.getDownlink(); boolean isJavaInput = Submitter.getIsJavaRecordReader(job); downlink.runMap(reporter.getInputSplit(), job.getNumReduceTasks(), isJavaInput); boolean skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false); try { if (isJavaInput) { // allocate key & value instances that are re-used for all entries K1 key = input.createKey(); V1 value = input.createValue(); downlink.setInputTypes(key.getClass().getName(), value.getClass().getName()); while (input.next(key, value)) { // map pair to output downlink.mapItem(key, value); if(skipping) { //flush the streams on every record input if running in skip mode //so that we don't buffer other records surrounding a bad record. downlink.flush(); } } downlink.endOfInput(); } application.waitForFinish(); } catch (Throwable t) { application.abort(t); } finally { application.cleanup(); } }