Java Code Examples for org.apache.hadoop.mapred.InputFormat#getRecordReader()
The following examples show how to use org.apache.hadoop.mapred.InputFormat#getRecordReader().
The original project and source file are noted above each example.
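Before the project-specific examples, here is a minimal, self-contained sketch of the typical call pattern: obtain splits from the InputFormat, open one RecordReader per split via getRecordReader(), and iterate with createKey(), createValue(), and next(). This is only an illustration; the class name and input path are hypothetical and not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetRecordReaderDemo {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("input/data.txt")); // hypothetical path
    TextInputFormat format = new TextInputFormat();
    format.configure(job); // TextInputFormat is JobConfigurable and must be configured before use

    // One RecordReader per split; Reporter.NULL is fine outside a running task.
    for (InputSplit split : format.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) { // next() reuses the key/value objects
          System.out.println(key.get() + "\t" + value);
        }
      } finally {
        reader.close();
      }
    }
  }
}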
Example 1
Source File: InputSampler.java From big-c with Apache License 2.0 (the identical method also appears in the RDFS, hadoop, and hadoop-gpu projects)
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(
        splits[i * splitStep], job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
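Note the key = reader.createKey() inside the retention branch: mapred RecordReader implementations are free to reuse the key object across next() calls, so each retained key must be replaced by a fresh instance, otherwise every element of samples would alias the same mutated object.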
Example 2
Source File: InputSampler.java From RDFS with Apache License 2.0 (the identical method also appears in the hadoop-gpu, big-c, and hadoop projects)
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(
        splits[i * splitStep], job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
Example 3
Source File: DelegatingInputFormat.java From big-c with Apache License 2.0 (the identical method also appears in the hadoop-gpu, hadoop, and RDFS projects)
@SuppressWarnings("unchecked") public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException { // Find the InputFormat and then the RecordReader from the // TaggedInputSplit. TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split; InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils .newInstance(taggedInputSplit.getInputFormatClass(), conf); return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter); }
Example 4
Source File: TestGroupedSplits.java From tez with Apache License 2.0
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
    InputSplit split, JobConf job) throws IOException {
  List<Text> result = new ArrayList<Text>();
  RecordReader<LongWritable, Text> reader =
      format.getRecordReader(split, job, voidReporter);
  LongWritable key = reader.createKey();
  Text value = reader.createValue();
  while (reader.next(key, value)) {
    result.add(value);
    value = reader.createValue();
  }
  reader.close();
  return result;
}
Example 5
Source File: TestInputOutputFormat.java From hive-dwrf with Apache License 2.0
@Test
public void testEmptyFile() throws Exception {
  JobConf job = new JobConf(conf);
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.close(true);
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  SerDe serde = new OrcSerde();
  serde.initialize(conf, properties);
  InputFormat<?,?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // read the whole file
  conf.set("hive.io.file.readcolumn.ids", "0,1");
  org.apache.hadoop.mapred.RecordReader reader =
      in.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  assertEquals(0.0, reader.getProgress(), 0.00001);
  assertEquals(0, reader.getPos());
  assertEquals(false, reader.next(key, value));
  reader.close();
  assertEquals(null, serde.getSerDeStats());
}
Example 6
Source File: IOUtilFunctions.java From systemds with Apache License 2.0
/**
 * Counts the number of columns in a given collection of csv file splits. This primitive aborts
 * if a row with more than 0 columns is found and hence is robust against empty file splits etc.
 *
 * @param splits input splits
 * @param informat input format
 * @param job job configuration
 * @param delim delimiter
 * @return the number of columns in the collection of csv file splits
 * @throws IOException if IOException occurs
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat,
    JobConf job, String delim) throws IOException {
  LongWritable key = new LongWritable();
  Text value = new Text();
  int ncol = -1;
  for( int i=0; i<splits.length && ncol<=0; i++ ) {
    RecordReader<LongWritable, Text> reader =
        informat.getRecordReader(splits[i], job, Reporter.NULL);
    try {
      if( reader.next(key, value) ) {
        boolean hasValue = true;
        if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
          hasValue = reader.next(key, value);
        if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
          hasValue = reader.next(key, value);
        String row = value.toString().trim();
        if( hasValue && !row.isEmpty() ) {
          ncol = IOUtilFunctions.countTokensCSV(row, delim);
        }
      }
    }
    finally {
      closeSilently(reader);
    }
  }
  return ncol;
}
Example 7
Source File: InputSampler.java From hadoop with Apache License 2.0 (the identical method also appears in the hadoop-gpu and RDFS projects)
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.length; ++i) {
    InputSplit tmp = splits[i];
    int j = r.nextInt(splits.length);
    splits[i] = splits[j];
    splits[j] = tmp;
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
               (i < splits.length && samples.size() < numSamples); ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i], job,
        Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(key);
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, key);
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
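These samplers are rarely invoked directly; they are normally passed to InputSampler.writePartitionFile(), which calls getSample() and writes evenly spaced sample keys to a partition file for TotalOrderPartitioner. The following is a rough sketch of that wiring under stated assumptions: the input path, key type, and sampler parameters are illustrative only.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class SamplerDemo {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("in")); // illustrative path
    job.setMapOutputKeyClass(Text.class);
    job.setNumReduceTasks(4); // 4 partitions -> 3 split points in the partition file

    // freq=0.1, numSamples=1000, maxSplitsSampled=10 -- illustrative values
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.1, 1000, 10);

    TotalOrderPartitioner.setPartitionFile(job, new Path("_partitions"));
    job.setPartitionerClass(TotalOrderPartitioner.class);
    InputSampler.writePartitionFile(job, sampler); // invokes sampler.getSample(...)
  }
}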
Example 8
Source File: HiveUtil.java From presto with Apache License 2.0
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns)
{
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns = columns.stream()
            .filter(column -> column.getColumnType() == REGULAR)
            .collect(toImmutableList());

    // Projected columns are not supported here
    readColumns.forEach(readColumn -> checkArgument(readColumn.isBaseColumn(), "column %s is not a base column", readColumn.getName()));

    List<Integer> readHiveColumnIndexes = readColumns.stream()
            .map(HiveColumnHandle::getBaseHiveColumnIndex)
            .collect(toImmutableList());

    // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
    configuration = copy(configuration);
    setReadColumns(configuration, readHiveColumnIndexes);

    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
    JobConf jobConf = toJobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);

    // propagate serialization configuration to getRecordReader
    schema.stringPropertyNames().stream()
            .filter(name -> name.startsWith("serialization."))
            .forEach(name -> jobConf.set(name, schema.getProperty(name)));

    configureCompressionCodecs(jobConf);

    try {
        RecordReader<WritableComparable, Writable> recordReader = (RecordReader<WritableComparable, Writable>)
                inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);

        int headerCount = getHeaderCount(schema);
        if (headerCount > 0) {
            Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue());
        }

        int footerCount = getFooterCount(schema);
        if (footerCount > 0) {
            recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf);
        }

        return recordReader;
    }
    catch (IOException e) {
        if (e instanceof TextLineLengthLimitExceededException) {
            throw new PrestoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                path, start, length, getInputFormatName(schema), firstNonNull(e.getMessage(), e.getClass().getName())), e);
    }
}
Example 9
Source File: FrameReaderTextCSV.java From systemds with Apache License 2.0
protected final int readCSVFrameFromInputSplit(InputSplit split,
    InputFormat<LongWritable,Text> informat, JobConf job, FrameBlock dest,
    ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first)
  throws IOException
{
  boolean hasHeader = _props.hasHeader();
  boolean isFill = _props.isFill();
  double dfillValue = _props.getFillValue();
  String sfillValue = String.valueOf(_props.getFillValue());
  String delim = _props.getDelim();

  //create record reader
  RecordReader<LongWritable, Text> reader =
      informat.getRecordReader(split, job, Reporter.NULL);
  LongWritable key = new LongWritable();
  Text value = new Text();
  int row = rl;
  int col = -1;

  //handle header if existing
  if(first && hasHeader) {
    reader.next(key, value); //read header
    dest.setColumnNames(value.toString().split(delim));
  }

  // Read the data
  boolean emptyValuesFound = false;
  try {
    while( reader.next(key, value) ) //foreach line
    {
      String cellStr = value.toString().trim();
      emptyValuesFound = false;
      col = 0;
      String[] parts = IOUtilFunctions.splitCSV(cellStr, delim);

      //parse frame meta data (missing values / num distinct)
      if( parts[0].equals(TfUtils.TXMTD_MVPREFIX) || parts[0].equals(TfUtils.TXMTD_NDPREFIX) ) {
        if( parts[0].equals(TfUtils.TXMTD_MVPREFIX) )
          for( int j=0; j<dest.getNumColumns(); j++ )
            dest.getColumnMetadata(j).setMvValue(parts[j+1]);
        else if( parts[0].equals(TfUtils.TXMTD_NDPREFIX) )
          for( int j=0; j<dest.getNumColumns(); j++ )
            dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j+1]));
        continue;
      }

      for( String part : parts ) //foreach cell
      {
        part = part.trim();
        if ( part.isEmpty() ) {
          if( isFill && dfillValue!=0 )
            dest.set(row, col, UtilFunctions.stringToObject(schema[col], sfillValue));
          emptyValuesFound = true;
        }
        else {
          dest.set(row, col, UtilFunctions.stringToObject(schema[col], part));
        }
        col++;
      }

      //sanity checks for empty values and number of columns
      IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
      IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
      row++;
    }
  }
  finally {
    IOUtilFunctions.closeSilently(reader);
  }

  return row;
}
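As the code suggests, rl is the starting row of this split within dest, and the method returns the index one past the last row written, so the caller can stitch several splits read in sequence into a single FrameBlock.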