org.apache.hadoop.mapred.FileSplit Java Examples
The following examples show how to use
org.apache.hadoop.mapred.FileSplit.
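Before the project examples, here is a minimal, self-contained sketch of how the class is typically used (assuming the old mapred API is on the classpath): a FileSplit describes the slice of a file that one map task reads — its path, start offset, length, and preferred hosts — and old-API record readers such as LineRecordReader consume it directly. The input path and the 1024-byte length below are placeholders for illustration only.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;

public class FileSplitExample {

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();

    // A FileSplit is just (path, start, length, hosts); the host list may be empty.
    // The path below is a placeholder -- point it at any existing text file.
    Path input = new Path(args.length > 0 ? args[0] : "/tmp/sample.txt");
    FileSplit split = new FileSplit(input, 0, 1024, new String[] {});

    System.out.println("path=" + split.getPath()
        + " start=" + split.getStart()
        + " length=" + split.getLength());

    // Read the split line by line with the old-API LineRecordReader,
    // the same pattern the NLineInputFormat examples below follow.
    LineRecordReader reader = new LineRecordReader(job, split);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    try {
      while (reader.next(key, value)) {
        System.out.println(key.get() + "\t" + value.toString());
      }
    } finally {
      reader.close();
    }
  }
}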
Example #1
Source File: ImportRecordReaderFactory.java From emr-dynamodb-connector with Apache License 2.0 | 6 votes |
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants.EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
Example #2
Source File: OrcInputFormat.java From hive-dwrf with Apache License 2.0 | 6 votes |
@Override
public RecordReader<NullWritable, OrcLazyRow> getRecordReader(InputSplit inputSplit,
    JobConf conf, Reporter reporter) throws IOException {
  ReaderWriterProfiler.setProfilerOptions(conf);
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  reporter.setStatus(fileSplit.toString());
  return new OrcRecordReader(
      OrcFile.createReader(fs, path, conf), conf, fileSplit.getStart(), fileSplit.getLength());
}
Example #3
Source File: TestHiveFileFormats.java From presto with Apache License 2.0 | 6 votes |
@Test(dataProvider = "rowCount")
public void testAvroFileInSymlinkTable(int rowCount)
    throws Exception {
  File file = File.createTempFile("presto_test", AVRO.name());
  //noinspection ResultOfMethodCallIgnored
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), AVRO, HiveCompressionCodec.NONE,
        getTestColumnsSupportedByAvro(), rowCount);
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName());
    splitProperties.setProperty(SERIALIZATION_LIB, AVRO.getSerDe());
    testCursorProvider(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), split,
        splitProperties, getTestColumnsSupportedByAvro(), SESSION, rowCount);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Example #4
Source File: LineDocRecordReader.java From RDFS with Apache License 2.0 | 6 votes |
/**
 * Constructor
 * @param job
 * @param split
 * @throws IOException
 */
public LineDocRecordReader(Configuration job, FileSplit split) throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  InputStream in = fileIn;
  boolean skipFirstLine = false;
  if (start != 0) {
    skipFirstLine = true; // wait till BufferedInputStream to skip
    --start;
    fileIn.seek(start);
  }
  this.in = new BufferedInputStream(in);
  if (skipFirstLine) { // skip first line and re-establish "start".
    start += LineDocRecordReader.readData(this.in, null, EOL);
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}
Example #5
Source File: StormParsedInputFormat.java From incubator-retired-mrql with Apache License 2.0 | 6 votes |
public ParsedRecordReader ( FileSplit split,
                            Configuration conf,
                            Class<? extends Parser> parser_class,
                            Trees args ) throws IOException {
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  fsin = fs.open(split.getPath());
  try {
    parser = parser_class.newInstance();
  } catch (Exception ex) {
    throw new Error("Unrecognized parser:"+parser_class);
  }
  parser.initialize(args);
  parser.open(fsin, start, end);
  result = null;
}
Example #6
Source File: EsHiveInputFormat.java From elasticsearch-hadoop with Apache License 2.0 | 6 votes |
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // first, merge input table properties (since there's no access to them ...)
  Settings settings = HadoopSettingsManager.loadFrom(job);
  //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

  Log log = LogFactory.getLog(getClass());
  // move on to initialization
  InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
  InitializationUtils.setUserProviderIfNotSet(settings, HadoopUserProvider.class, log);
  if (settings.getOutputAsJson() == false) {
    // Only set the fields if we aren't asking for raw JSON
    settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS,
        StringUtils.concatenate(HiveUtils.columnToAlias(settings), ","));
  }
  HiveUtils.init(settings, log);

  // decorate original splits as FileSplit
  InputSplit[] shardSplits = super.getSplits(job, numSplits);
  FileSplit[] wrappers = new FileSplit[shardSplits.length];
  Path path = new Path(job.get(HiveConstants.TABLE_LOCATION));
  for (int i = 0; i < wrappers.length; i++) {
    wrappers[i] = new EsHiveSplit(shardSplits[i], path);
  }
  return wrappers;
}
Example #7
Source File: StreamXmlRecordReader.java From RDFS with Apache License 2.0 | 6 votes |
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
                             JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);

  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");

  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;

  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Example #8
Source File: MDSHiveLineInputFormat.java From multiple-dimension-spread with Apache License 2.0 | 6 votes |
@Override
public RecordReader<NullWritable, ColumnAndIndex> getRecordReader(
    final InputSplit split, final JobConf job, final Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(job);
  long fileLength = fs.getLength(path);
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open(path);
  IJobReporter jobReporter = new HadoopJobReporter(reporter);
  jobReporter.setStatus(String.format("Read file : %s", path.toString()));
  HiveReaderSetting hiveConfig = new HiveReaderSetting(fileSplit, job);
  if (hiveConfig.isVectorMode()) {
    IVectorizedReaderSetting vectorizedSetting =
        new HiveVectorizedReaderSetting(fileSplit, job, hiveConfig);
    return (RecordReader) new MDSHiveDirectVectorizedReader(
        in, fileLength, start, length, vectorizedSetting, jobReporter);
  } else {
    return new MDSHiveLineReader(in, fileLength, start, length, hiveConfig, jobReporter, spreadCounter);
  }
}
Example #9
Source File: IOUtilFunctions.java From systemds with Apache License 2.0 | 6 votes |
public static InputSplit[] sortInputSplits(InputSplit[] splits) {
  if (splits[0] instanceof FileSplit) {
    // The splits do not always arrive in order by file name.
    // Sort the splits lexicographically by path so that the header will
    // be in the first split.
    // Note that we're assuming that the splits come in order by offset
    Arrays.sort(splits, new Comparator<InputSplit>() {
      @Override
      public int compare(InputSplit o1, InputSplit o2) {
        Path p1 = ((FileSplit) o1).getPath();
        Path p2 = ((FileSplit) o2).getPath();
        return p1.toString().compareTo(p2.toString());
      }
    });
  }
  return splits;
}
Example #10
Source File: ExcelCellFileInputFormat.java From hadoopoffice with Apache License 2.0 | 6 votes |
@Override
public RecordReader<Text, SpreadSheetCellDAO> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  /** Create reader **/
  try {
    // send configuration option to ms excel. The format of the Excel (old vs new) is detected automatically
    job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
    return new ExcelCellRecordReader((FileSplit) split, job, reporter);
  } catch (FormatNotUnderstoodException e) {
    // log
    LOGIF.error(e);
  } catch (GeneralSecurityException gse) {
    LOGIF.error(gse);
  }
  return null;
}
Example #11
Source File: BackgroundHiveSplitLoader.java From presto with Apache License 2.0 | 5 votes |
private ListenableFuture<?> addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory)
    throws IOException {
  ListenableFuture<?> lastResult = COMPLETED_FUTURE;
  for (InputSplit inputSplit : targetSplits) {
    Optional<InternalHiveSplit> internalHiveSplit = splitFactory.createInternalHiveSplit((FileSplit) inputSplit);
    if (internalHiveSplit.isPresent()) {
      lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
    }
    if (stopped) {
      return COMPLETED_FUTURE;
    }
  }
  return lastResult;
}
Example #12
Source File: WarcFileRecordReader.java From wikireverse with MIT License | 5 votes |
public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException {
  if (split instanceof FileSplit) {
    this.filePathList = new Path[1];
    this.filePathList[0] = ((FileSplit) split).getPath();
  } else if (split instanceof MultiFileSplit) {
    this.filePathList = ((MultiFileSplit) split).getPaths();
  } else {
    throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
  }

  // Use FileSystem.get to open Common Crawl URIs using the S3 protocol.
  URI uri = filePathList[0].toUri();
  this.fs = FileSystem.get(uri, conf);

  // get the total file sizes
  for (int i = 0; i < filePathList.length; i++) {
    totalFileSize += fs.getFileStatus(filePathList[i]).getLen();
  }

  Class<? extends CompressionCodec> codecClass = null;
  try {
    codecClass = conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec")
        .asSubclass(CompressionCodec.class);
    compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
  } catch (ClassNotFoundException cnfEx) {
    compressionCodec = null;
    LOG.info("!!! ClassNotFound exception thrown setting Gzip codec");
  }

  openNextFile();
}
Example #13
Source File: TestHoodieRealtimeFileSplit.java From hudi with Apache License 2.0 | 5 votes |
@BeforeEach
public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception {
  basePath = tempDir.toAbsolutePath().toString();
  deltaLogPaths = Collections.singletonList(basePath + "/1.log");
  fileSplitName = basePath + "/test.file";
  baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {});
  maxCommitTime = "10001";

  split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogPaths, maxCommitTime);
}
Example #14
Source File: TestNewTextReader.java From dremio-oss with Apache License 2.0 | 5 votes |
@Test
public void testFileNotFound() {
  FileSplit split = mock(FileSplit.class);
  when(split.getPath()).thenReturn(new org.apache.hadoop.fs.Path("/notExist/notExitFile"));
  TextParsingSettings settings = mock(TextParsingSettings.class);
  when(settings.isHeaderExtractionEnabled()).thenReturn(true);
  SchemaPath column = mock(SchemaPath.class);
  List<SchemaPath> columns = new ArrayList<>(1);
  columns.add(column);
  SabotContext context = mock(SabotContext.class);
  try (BufferAllocator allocator = allocatorRule.newAllocator("test-new-text-reader", 0, Long.MAX_VALUE)) {
    when(context.getAllocator()).thenReturn(allocator);
    OptionManager optionManager = mock(OptionManager.class);
    when(optionManager.getOption(ExecConstants.LIMIT_FIELD_SIZE_BYTES))
        .thenReturn(ExecConstants.LIMIT_FIELD_SIZE_BYTES.getDefault().getNumVal());
    when(optionManager.getOptionValidatorListing()).thenReturn(mock(OptionValidatorListing.class));

    Path path = Path.of("/notExist");
    try (BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
        OperatorContextImpl operatorContext =
            new OperatorContextImpl(context.getConfig(), sampleAllocator, optionManager, 1000);
        FileSystem dfs = HadoopFileSystem.get(path, new Configuration(), null);
        SampleMutator mutator = new SampleMutator(sampleAllocator);
        CompliantTextRecordReader reader = new CompliantTextRecordReader(split,
            HadoopCompressionCodecFactory.DEFAULT, dfs, operatorContext, settings, columns);
    ) {
      reader.setup(mutator);
    } catch (Exception e) {
      // java.io.FileNotFoundException is expected, but memory leak is not expected.
      assertTrue(e.getCause() instanceof FileNotFoundException);
    }
  }
}
Example #15
Source File: RegexIngestMapper.java From hadoop-solr with Apache License 2.0 | 5 votes |
@Override
public LWDocument[] toDocuments(Writable key, Writable value, Reporter reporter,
    Configuration conf) throws IOException {
  if (key != null && value != null) {
    LWDocument doc = createDocument(key.toString() + "-" + System.currentTimeMillis(), null);
    Matcher matcher = regex.matcher(value.toString());
    if (matcher != null) {
      if (match) {
        if (matcher.matches()) {
          processMatch(doc, matcher);
        }
      } else {
        while (matcher.find()) {
          processMatch(doc, matcher);
          reporter.progress(); // do we really even need this?
        }
      }
    }
    // Adding the file path where this record was taken
    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    String originalLogFilePath = fileSplit.getPath().toUri().getPath();
    doc.addField(FIELD_PATH, originalLogFilePath);
    String docId = originalLogFilePath + "-" + doc.getId();
    doc.setId(docId);
    return new LWDocument[] {doc};
  }
  return null;
}
Example #16
Source File: HadoopInputFormatTest.java From flink with Apache License 2.0 | 5 votes |
@Test
public void testCreateInputSplits() throws Exception {
  FileSplit[] result = new FileSplit[1];
  result[0] = getFileSplit();
  DummyInputFormat inputFormat = mock(DummyInputFormat.class);
  when(inputFormat.getSplits(any(JobConf.class), anyInt())).thenReturn(result);

  HadoopInputFormat<String, Long> hadoopInputFormat =
      new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
  hadoopInputFormat.createInputSplits(2);

  verify(inputFormat, times(1)).getSplits(any(JobConf.class), anyInt());
}
Example #17
Source File: NLineInputFormat.java From big-c with Apache License 2.0 | 5 votes |
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #18
Source File: FileScanFramework.java From Bats with Apache License 2.0 | 5 votes |
@Override
public ManagedReader<? extends SchemaNegotiator> next() {
  FileSplit split = fileFramework.nextSplit();
  if (split == null) {
    return null;
  }
  return newReader(split);
}
Example #19
Source File: LineIndexer.java From attic-apex-malhar with Apache License 2.0 | 5 votes |
public void map(LongWritable key, Text val,
    OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
  String fileName = fileSplit.getPath().getName();
  location.set(fileName);

  String line = val.toString();
  StringTokenizer itr = new StringTokenizer(line.toLowerCase());
  while (itr.hasMoreTokens()) {
    word.set(itr.nextToken());
    output.collect(word, location);
  }
}
Example #20
Source File: NLineInputFormat.java From hadoop-gpu with Apache License 2.0 | 5 votes |
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #21
Source File: DistCh.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  final int srcCount = job.getInt(OP_COUNT_LABEL, -1);
  final int targetcount = srcCount / numSplits;
  String srclist = job.get(OP_LIST_LABEL, "");
  if (srcCount < 0 || "".equals(srclist)) {
    throw new RuntimeException("Invalid metadata: #files(" + srcCount +
        ") listuri(" + srclist + ")");
  }
  Path srcs = new Path(srclist);
  FileSystem fs = srcs.getFileSystem(job);

  List<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

  Text key = new Text();
  FileOperation value = new FileOperation();
  long prev = 0L;
  int count = 0; //count src
  try (SequenceFile.Reader in = new SequenceFile.Reader(fs, srcs, job)) {
    for ( ; in.next(key, value); ) {
      long curr = in.getPosition();
      long delta = curr - prev;
      if (++count > targetcount) {
        count = 0;
        splits.add(new FileSplit(srcs, prev, delta, (String[]) null));
        prev = curr;
      }
    }
  }
  long remaining = fs.getFileStatus(srcs).getLen() - prev;
  if (remaining != 0) {
    splits.add(new FileSplit(srcs, prev, remaining, (String[]) null));
  }
  LOG.info("numSplits=" + numSplits + ", splits.size()=" + splits.size());
  return splits.toArray(new FileSplit[splits.size()]);
}
Example #22
Source File: NLineInputFormat.java From RDFS with Apache License 2.0 | 5 votes |
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #23
Source File: StormParsedInputFormat.java From incubator-retired-mrql with Apache License 2.0 | 5 votes |
@Override
public RecordReader<MRContainer, MRContainer> getRecordReader(InputSplit split,
    JobConf job, Reporter reporter) throws IOException {
  StormEvaluator.load_source_dir();  // load the parsed source parameters from a file
  String path = ((FileSplit) split).getPath().toString();
  ParsedDataSource ds = (ParsedDataSource) DataSource.get(path, Plan.conf);
  return new ParsedRecordReader((FileSplit) split, job, ds.parser, (Trees) ds.args);
}
Example #24
Source File: ExcelFileInputFormat.java From hadoopoffice with Apache License 2.0 | 5 votes |
@Override
public RecordReader<Text, ArrayWritable> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  /** Create reader **/
  try {
    // send configuration option to ms excel. The format of the Excel (old vs new) is detected automatically
    job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
    return new ExcelRecordReader((FileSplit) split, job, reporter);
  } catch (FormatNotUnderstoodException e) {
    // log
    LOGIF.error(e);
  } catch (GeneralSecurityException gse) {
    LOGIF.error(gse);
  }
  return null;
}
Example #25
Source File: JsonAccessor.java From pxf with Apache License 2.0 | 5 votes |
@Override
protected Object getReader(JobConf conf, InputSplit split) throws IOException {
  if (!isEmpty(identifier)) {
    conf.set(JsonRecordReader.RECORD_MEMBER_IDENTIFIER, identifier);
    conf.setInt(JsonRecordReader.RECORD_MAX_LENGTH, maxRecordLength);
    return new JsonRecordReader(conf, (FileSplit) split);
  } else {
    return new LineRecordReader(conf, (FileSplit) split);
  }
}
Example #26
Source File: WARCInputFormat.java From warc-hadoop with MIT License | 5 votes |
/**
 * Opens a WARC file (possibly compressed) for reading, and returns a RecordReader for
 * accessing it.
 */
@Override
public RecordReader<LongWritable, WARCWritable> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  reporter.setStatus(split.toString());
  return new WARCReader(job, (FileSplit) split);
}
Example #27
Source File: HiveTableFileInputFormat.java From flink with Apache License 2.0 | 5 votes |
@VisibleForTesting
static FileSplit toHadoopFileSplit(FileInputSplit fileSplit) throws IOException {
  URI uri = fileSplit.getPath().toUri();
  long length = fileSplit.getLength();
  // Hadoop FileSplit should not have -1 length.
  if (length == -1) {
    length = fileSplit.getPath().getFileSystem().getFileStatus(fileSplit.getPath()).getLen()
        - fileSplit.getStart();
  }
  return new FileSplit(new Path(uri), fileSplit.getStart(), length, (String[]) null);
}
Example #28
Source File: NLineInputFormat.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(JobConf, int)
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
  for (FileStatus status : listStatus(job)) {
    for (org.apache.hadoop.mapreduce.lib.input.FileSplit split :
        org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.getSplitsForFile(status, job, N)) {
      splits.add(new FileSplit(split));
    }
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
Example #29
Source File: NLineInputFormat.java From hadoop with Apache License 2.0 | 5 votes |
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #30
Source File: InvertedIndex.java From hadoop-book with Apache License 2.0 | 5 votes |
public void map(LongWritable key, Text val,
    OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
  String fileName = fileSplit.getPath().getName();
  location.set(fileName);

  String line = val.toString();
  StringTokenizer itr = new StringTokenizer(line.toLowerCase());
  while (itr.hasMoreTokens()) {
    word.set(itr.nextToken());
    output.collect(word, location);
  }
}