org.apache.avro.mapred.FsInput Java Examples
The following examples show how to use
org.apache.avro.mapred.FsInput.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BinaryAvroSchemaFileReader.java From pxf with Apache License 2.0 | 6 votes |
/**
 * Reads the Avro schema embedded in a binary Avro data file.
 *
 * <p>The file is first looked up through {@code fileSearcher}; when that search yields
 * {@code null}, the schema file is opened through {@link FsInput} at the URI that
 * {@code hcfsType} resolves for {@code schemaName}.
 *
 * @param configuration configuration used to resolve the data URI and to open the remote file
 * @param schemaName    name (or path) of the file holding the schema
 * @param hcfsType      resolver that turns {@code schemaName} into a data URI
 * @param fileSearcher  searcher consulted first for a local copy of the file
 * @return the schema stored in the header of the Avro file
 * @throws IOException if the file cannot be opened, read, or closed
 */
@Override
public Schema readSchema(Configuration configuration, String schemaName, HcfsType hcfsType,
                         AvroUtilities.FileSearcher fileSearcher) throws IOException {
    final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>();
    final File localSchemaFile = fileSearcher.searchForFile(schemaName);

    final DataFileReader<GenericRecord> schemaSource;
    if (localSchemaFile != null) {
        schemaSource = new DataFileReader<>(localSchemaFile, recordReader);
    } else {
        // No local copy: fall back to the location resolved by the HCFS type.
        final Path remotePath = new Path(hcfsType.getDataUri(configuration, schemaName));
        final FsInput remoteInput = new FsInput(remotePath, configuration);
        schemaSource = new DataFileReader<>(remoteInput, recordReader);
    }

    try {
        return schemaSource.getSchema();
    } finally {
        schemaSource.close();
    }
}
Example #2
Source File: AvroFsHelper.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Opens the given Avro file and returns a {@link DataFileReader} over it.
 *
 * <p>The caller owns the returned reader and must close it.
 *
 * @param file the path to the avro file to open
 * @return a {@link DataFileReader} for the file, or {@code null} when the file does not exist
 * @throws FileBasedHelperException if an {@link IOException} occurs while checking for or opening the file
 */
public DataFileReader<GenericRecord> getAvroFile(String file) throws FileBasedHelperException {
  try {
    final Path avroPath = new Path(file);
    if (!this.getFileSystem().exists(avroPath)) {
      LOGGER.warn(file + " does not exist.");
      return null;
    }

    final boolean proxyAsUser = this.getState().getPropAsBoolean(
        ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER);
    final GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();

    if (proxyAsUser) {
      // The proxy input is built from the helper's file system rather than from its configuration.
      return new DataFileReader<>(new ProxyFsInput(avroPath, this.getFileSystem()), datumReader);
    } else {
      return new DataFileReader<>(new FsInput(avroPath, this.getFileSystem().getConf()), datumReader);
    }
  } catch (IOException e) {
    throw new FileBasedHelperException("Failed to open avro file " + file + " due to error " + e.getMessage(), e);
  }
}
Example #3
Source File: FileAwareInputStreamExtractorWithCheckSchema.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Uses {@link AvroSchemaCheckStrategy} to make sure the real schema of the file and the expected
 * schema have matching field names and types.
 *
 * <p>The check is skipped (and {@code true} returned) when {@link CopySource#SCHEMA_CHECK_ENABLED}
 * is not enabled in the state.
 *
 * @param fsFromFile file system of the source file. NOTE(review): not used by this method; the
 *                   file is opened with a fresh {@link Configuration} instead — confirm that is intended.
 * @return {@code true} when the check is disabled or the strategy accepts the schema pair
 * @throws IOException if the file cannot be read, the expected schema is not configured, or no
 *                     check strategy can be created
 */
protected boolean schemaChecking(FileSystem fsFromFile) throws IOException {
  if (!this.state.getPropAsBoolean(CopySource.SCHEMA_CHECK_ENABLED, CopySource.DEFAULT_SCHEMA_CHECK_ENABLED)) {
    return true;
  }

  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  Schema schema;
  // Only the file header is needed; close the input and the reader as soon as the schema is read
  // so the underlying stream is not leaked.
  try (FsInput input = new FsInput(this.file.getFileStatus().getPath(), new Configuration());
      DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(input, datumReader)) {
    schema = dataFileReader.getSchema();
  }

  String expectedSchemaJson = this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA);
  if (expectedSchemaJson == null) {
    throw new IOException("Expected schema is not set properly");
  }
  Schema expectedSchema = new Schema.Parser().parse(expectedSchemaJson);

  AvroSchemaCheckStrategy strategy = AvroSchemaCheckStrategy.AvroSchemaCheckStrategyFactory.create(this.state);
  if (strategy == null) {
    throw new IOException("schema check strategy cannot be initialized");
  }
  return strategy.compare(expectedSchema, schema);
}
Example #4
Source File: AvroUtils.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Reads an Avro schema from one of the avro files of a directory.
 *
 * <p>The candidate files come from {@code getDirectorySchemaHelper}; the first one is used when
 * {@code latest} is {@code true}, the last one otherwise.
 *
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory
 * @param latest true to return latest schema, false to return oldest schema
 * @return the latest/oldest schema in the directory, or {@code null} when no avro file is found
 * @throws IOException if listing or reading fails; the original exception is attached as the cause
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
  try (Closer resources = Closer.create()) {
    List<FileStatus> candidates = getDirectorySchemaHelper(directory, fs);
    if (candidates == null || candidates.isEmpty()) {
      LOG.warn("There is no previous avro file in the directory: " + directory);
      return null;
    }

    int chosenIndex = latest ? 0 : candidates.size() - 1;
    FileStatus chosen = candidates.get(chosenIndex);
    LOG.debug("Path to get the avro schema: " + chosen);

    FsInput chosenInput = new FsInput(chosen.getPath(), fs.getConf());
    DataFileReader<GenericRecord> schemaReader =
        resources.register(new DataFileReader<>(chosenInput, new GenericDatumReader<GenericRecord>()));
    return schemaReader.getSchema();
  } catch (IOException ioe) {
    throw new IOException("Cannot get the schema for directory " + directory, ioe);
  }
}
Example #5
Source File: AvroScanner.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Initializes the AvroScanner: prepares the projection and output tuple, resolves the Avro
 * schema from the table meta, and opens a {@link DataFileReader} over the fragment's file.
 *
 * @throws IOException if the Avro schema or the fragment file cannot be read
 */
@Override
public void init() throws IOException {
  // Without explicit targets, fall back to whatever schema.toArray() provides.
  if (targets == null) {
    targets = schema.toArray();
  }
  prepareProjection(targets);
  outTuple = new VTuple(projectionMap.length);

  final Schema fileSchema = AvroUtil.getAvroSchema(meta, conf);
  avroFields = fileSchema.getFields();

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>(fileSchema);
  final SeekableInput fragmentInput = new FsInput(fragment.getPath(), conf);
  dataFileReader = new DataFileReader<>(fragmentInput, recordReader);

  super.init();
}
Example #6
Source File: AvroParquetConvertCreator.java From datacollector with Apache License 2.0 | 6 votes |
/**
 * Hands the classes this converter depends on to {@code MapreduceUtils.addJarsToJob} together
 * with the job configuration.
 *
 * <p>NOTE(review): presumably the helper locates the jar containing each class and attaches it to
 * the job — confirm against {@code MapreduceUtils}.
 *
 * @param conf the job configuration passed through to the helper
 */
@Override
protected void addNecessaryJarsToJob(Configuration conf) {
  MapreduceUtils.addJarsToJob(conf,
    SemanticVersion.class,
    ParquetWriter.class,
    AvroParquetWriter.class,
    AvroParquetWriterBuilder190Int96.class,
    AvroSchemaConverter190Int96Avro18.class,
    FsInput.class,
    CompressionCodec.class,
    ParquetProperties.class,
    BytesInput.class,
    AvroToParquetConverterUtil.class,
    AvroLogicalTypeSupport.class
  );
}
Example #7
Source File: Purge.java From Cubert with Apache License 2.0 | 6 votes |
/**
 * Opens an Avro data file either from the local file system or through {@link FsInput}.
 *
 * @param filename path of the Avro file
 * @param localFS  {@code true} to open {@code filename} as a local {@link File};
 *                 {@code false} to open it through {@link FsInput} with this object's {@code conf}
 * @return a reader over the file; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened
 */
private DataFileReader<GenericRecord> createDataFileReader(String filename,
                                                           boolean localFS) throws IOException
{
    final DatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();

    if (localFS)
    {
        return new DataFileReader<GenericRecord>(new File(filename), recordReader);
    }

    final SeekableInput remoteInput = new FsInput(new Path(filename), conf);
    return new DataFileReader<GenericRecord>(remoteInput, recordReader);
}
Example #8
Source File: AvroRecordReader.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Opens an Avro {@link DataFileReader} for the given file while running as the proxy user
 * built from {@code opUserName} and {@code queryUserName}.
 *
 * @param hadoop path of the Avro file to open
 * @param fs     file system whose configuration is used to open the file
 * @return a reader over the file
 * @throws ExecutionSetupException if opening the file fails or the calling thread is interrupted
 */
private DataFileReader<GenericContainer> getReader(final Path hadoop, final FileSystem fs) throws ExecutionSetupException {
  try {
    // Declared with its full target type so doAs resolves to the exception-throwing overload.
    final PrivilegedExceptionAction<DataFileReader<GenericContainer>> openAction =
        () -> new DataFileReader<>(new FsInput(hadoop, fs.getConf()), new GenericDatumReader<GenericContainer>());

    final UserGroupInformation proxyUser = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
    return proxyUser.doAs(openAction);
  } catch (IOException | InterruptedException e) {
    final String message = String.format("Error in creating avro reader for file: %s", hadoop);
    throw new ExecutionSetupException(message, e);
  }
}
Example #9
Source File: AvroRecordReader.java From spork with Apache License 2.0 | 5 votes |
/**
 * Prepares this reader for one file split: records the split's byte range, opens the split's
 * file as an Avro container and moves the reader to the first sync point at or after the
 * start of the split.
 *
 * @param isplit the split to read; must be a {@link FileSplit}
 * @param tc     task context supplying the Hadoop configuration
 * @throws IOException if the file cannot be opened or synced
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
    throws IOException, InterruptedException {

  final FileSplit fileSplit = (FileSplit) isplit;
  final long splitStart = fileSplit.getStart();
  start = splitStart;
  end = splitStart + fileSplit.getLength();

  final DatumReader<GenericData.Record> recordReader =
      new GenericDatumReader<GenericData.Record>(schema);
  final FsInput splitInput = new FsInput(fileSplit.getPath(), tc.getConfiguration());
  reader = DataFileReader.openReader(splitInput, recordReader);
  reader.sync(start);
}
Example #10
Source File: AvroArrayReader.java From spork with Apache License 2.0 | 5 votes |
/**
 * Prepares this array reader for one file split: records the split's byte range, opens the
 * split's file as an Avro container and moves the reader to the first sync point at or after
 * the start of the split.
 *
 * @param isplit the split to read; must be a {@link FileSplit}
 * @param tc     task context supplying the Hadoop configuration
 * @throws IOException if the file cannot be opened or synced
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
    throws IOException, InterruptedException {

  final FileSplit fileSplit = (FileSplit) isplit;
  final long splitStart = fileSplit.getStart();
  start = splitStart;
  end = splitStart + fileSplit.getLength();

  final DatumReader<GenericData.Array<Object>> arrayReader =
      new GenericDatumReader<GenericData.Array<Object>>(schema);
  final FsInput splitInput = new FsInput(fileSplit.getPath(), tc.getConfiguration());
  reader = DataFileReader.openReader(splitInput, arrayReader);
  reader.sync(start);
}
Example #11
Source File: AvroTestTools.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Read all avro records in an HDFS location into a map from file name to {@link RecordIterator}.
 *
 * <p>Keys are the file paths relative to {@code path}. Each file's reader stays open until its
 * iterator has been consumed to the end, at which point the iterator closes it.
 *
 * @param fs   file system holding the files
 * @param path root location to scan recursively (files rejected by {@code HiddenFilter} are skipped)
 * @return map from relative file path to an iterator over that file's records; empty when
 *         {@code path} does not exist
 * @throws IOException if listing the location or opening a file fails
 */
@Override
public TreeMap<String, RecordIterator> readAllRecordsInBinaryDirectory(FileSystem fs, Path path)
    throws IOException {
  TreeMap<String, RecordIterator> recordsByFile = new TreeMap<>();

  if (fs.exists(path)) {
    PathFilter visibleOnly = new HiddenFilter();
    for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, path, visibleOnly)) {
      SeekableInput fileInput = new FsInput(fileStatus.getPath(), fs);
      DataFileReader<GenericRecord> fileReader = new DataFileReader<>(fileInput, new GenericDatumReader<>());

      String relativeName = PathUtils.relativizePath(fileStatus.getPath(), path).toString();
      recordsByFile.put(relativeName, new RecordIterator(fileReader.getSchema(), new AbstractIterator<GenericRecord>() {
        @Override
        protected GenericRecord computeNext() {
          if (!fileReader.hasNext()) {
            // Exhausted: release the reader before signalling the end of the iteration.
            try {
              fileReader.close();
            } catch (IOException ioe) {
              log.error("Failed to close data file reader.", ioe);
            }
            return endOfData();
          }
          return fileReader.next();
        }
      }));
    }
  }

  return recordsByFile;
}
Example #12
Source File: TestAvroExtractor.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Reads every record of an Avro data file into memory.
 *
 * @param path location of the Avro file, resolved against a default {@link Configuration}
 * @return all records of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path) throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();

  // try-with-resources releases the input and the reader even when opening the container or
  // decoding a record fails part-way through.
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
Example #13
Source File: AvroUtilsTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Loads all records stored in an Avro data file.
 *
 * @param path location of the Avro file, resolved against a default {@link Configuration}
 * @return the records of the file, in the order they are stored
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path) throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();

  // Both resources are closed on every exit path, including a failure inside openReader or
  // while iterating, so no file handle is left behind.
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
Example #14
Source File: AvroUtils.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Get Avro schema from an Avro data file.
 *
 * @param dataFile path of the Avro data file
 * @param fs       file system whose configuration is used to open the file
 * @return the schema stored in the file
 * @throws IOException if the file cannot be opened, read, or closed
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  try (SeekableInput fileInput = new FsInput(dataFile, fs.getConf())) {
    final GenericDatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();
    try (DataFileReader<GenericRecord> schemaReader = new DataFileReader<>(fileInput, recordReader)) {
      return schemaReader.getSchema();
    }
  }
}
Example #15
Source File: AvroUtils.java From Cubert with Apache License 2.0 | 5 votes |
/**
 * Extracts the schema of an Avro file.
 *
 * <p>The first match for {@code *.avro} reported by {@code FileSystemUtils.getFirstMatch} under
 * {@code path} is used, and its name is printed to standard output.
 *
 * @param conf configuration used to resolve the file system and open the file
 * @param path location to search for an Avro file
 * @return the schema obtained from the matched file
 * @throws IOException if no file matches or the file cannot be read
 */
public static Schema getSchema(Configuration conf, Path path) throws IOException
{
    final Path firstMatch =
            FileSystemUtils.getFirstMatch(path.getFileSystem(conf), path, "*.avro", true);

    if (firstMatch == null)
    {
        throw new IOException("there are no files in " + path);
    }

    System.out.println("Obtaining schema of avro file " + firstMatch);

    final FsInput matchedInput = new FsInput(firstMatch, conf);
    return getSchema(matchedInput);
}
Example #16
Source File: ClusterHdfsSource.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Builds a preview batch from an Avro file.
 *
 * <p>Each record read is re-serialized on its own into an in-memory Avro container and added to
 * the batch as a pair of {@code <file path>::<record index>} and the serialized bytes. Reading
 * stops when the file is exhausted, the batch holds {@code batchSize} entries, or the running
 * preview count (starting from the current size of {@code previewBuffer}) reaches
 * {@code batchSize}.
 *
 * @param fileStatus status of the Avro file to preview
 * @param batchSize  upper bound applied to both the batch size and the preview count
 * @return the entries collected from the file
 * @throws IOException if the file cannot be read or a record cannot be re-serialized
 */
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize) throws IOException {
  final Path filePath = fileStatus.getPath();
  final SeekableInput fileInput = new FsInput(filePath, hadoopConf);
  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>();
  final List<Map.Entry> batch = new ArrayList<>();

  try (FileReader<GenericRecord> fileReader = DataFileReader.openReader(fileInput, recordReader)) {
    int previewCount = previewBuffer.size();
    for (int count = 0;
         fileReader.hasNext() && batch.size() < batchSize && previewCount < batchSize;
         count++, previewCount++) {
      final GenericRecord datum = fileReader.next();

      // Serialize this single record into its own Avro container held in memory.
      final ByteArrayOutputStream serialized = new ByteArrayOutputStream();
      final DataFileWriter<GenericRecord> recordWriter =
          new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(datum.getSchema()));
      try {
        recordWriter.create(datum.getSchema(), serialized);
        recordWriter.append(datum);
      } finally {
        recordWriter.close();
        serialized.close();
      }

      final String entryKey = filePath.toUri().getPath() + "::" + count;
      batch.add(new Pair(entryKey, serialized.toByteArray()));
    }
  }
  return batch;
}
Example #17
Source File: AvroFileReader.java From streamx with Apache License 2.0 | 5 votes |
/**
 * Reads every datum of an Avro data file into a collection.
 *
 * @param conf configuration used to open the file
 * @param path location of the Avro file
 * @return all data of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  Collection<Object> collection = new ArrayList<>();
  DatumReader<Object> reader = new GenericDatumReader<>();

  // try-with-resources closes the input and the reader even if opening the container or
  // decoding a datum throws.
  try (SeekableInput input = new FsInput(path, conf);
      FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    for (Object object : fileReader) {
      collection.add(object);
    }
  }
  return collection;
}
Example #18
Source File: AvroFileReader.java From streamx with Apache License 2.0 | 5 votes |
/**
 * Reads the Avro schema of a data file and converts it with {@code avroData.toConnectSchema}.
 *
 * @param conf configuration used to open the file
 * @param path location of the Avro file
 * @return the converted schema
 * @throws IOException if the file cannot be opened, read, or closed
 */
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  DatumReader<Object> reader = new GenericDatumReader<>();
  org.apache.avro.Schema schema;

  // The file is only needed for its header; release it before converting, and on failure too.
  try (SeekableInput input = new FsInput(path, conf);
      FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    schema = fileReader.getSchema();
  }
  return avroData.toConnectSchema(schema);
}
Example #19
Source File: LobAvroImportTestCase.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Opens a {@link DataFileReader} over the given Avro file.
 *
 * <p>Unless the test runs on a physical cluster, the configuration returned by
 * {@code getConf()} is first pointed at the local file system.
 *
 * @param filename path that we're opening a reader for.
 * @return instance of DataFileReader; the caller is responsible for closing it.
 * @throws IOException if the file cannot be opened
 */
private DataFileReader<GenericRecord> read(Path filename) throws IOException {
  final Configuration conf = getConf();
  final boolean onPhysicalCluster = BaseSqoopTestCase.isOnPhysicalCluster();
  if (!onPhysicalCluster) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }

  return new DataFileReader<GenericRecord>(
      new FsInput(filename, conf),
      new GenericDatumReader<GenericRecord>());
}
Example #20
Source File: TestAvroImport.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Opens a {@link DataFileReader} over the given Avro file using a fresh {@link Configuration}.
 *
 * <p>Unless the test runs on a physical cluster, that configuration is pointed at the local
 * file system.
 *
 * @param filename path of the Avro file to open
 * @return a reader over the file; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened
 */
private DataFileReader<GenericRecord> read(Path filename) throws IOException {
  final Configuration conf = new Configuration();
  final boolean onPhysicalCluster = BaseSqoopTestCase.isOnPhysicalCluster();
  if (!onPhysicalCluster) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }

  return new DataFileReader<GenericRecord>(
      new FsInput(filename, conf),
      new GenericDatumReader<GenericRecord>());
}
Example #21
Source File: TestMerge.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Scans an Avro file for a record that {@code valueMatches} accepts for the expected values.
 *
 * @param fs     file system of the file. NOTE(review): not used; the file is opened with a
 *               fresh {@link Configuration} — confirm that is intended.
 * @param p      path of the Avro file to scan
 * @param record expected values handed to {@code valueMatches} for each record read
 * @return {@code true} as soon as a matching record is found, {@code false} if none matches
 * @throws IOException if the file cannot be opened, read, or closed
 */
private boolean checkAvroFileForLine(FileSystem fs, Path p, List<Integer> record) throws IOException {
  SeekableInput in = new FsInput(p, new Configuration());
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> reader = DataFileReader.openReader(in, datumReader);
  try {
    reader.sync(0);
    while (reader.hasNext()) {
      if (valueMatches(reader.next(), record)) {
        return true;
      }
    }
    return false;
  } finally {
    // Release the file handle on every exit path: early match, exhaustion, or read failure.
    reader.close();
  }
}
Example #22
Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; Configuration conf = context.getConfiguration(); SeekableInput in = new FsInput(split.getPath(), conf); DatumReader<T> datumReader = new GenericDatumReader<T>(); this.reader = DataFileReader.openReader(in, datumReader); reader.sync(split.getStart()); // sync to start this.start = reader.tell(); this.end = split.getStart() + split.getLength(); }
Example #23
Source File: AvroUtil.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Get the schema of AVRO files stored in a directory.
 *
 * <p>When {@code path} is a directory, the first listed file whose name starts with neither
 * {@code "_"} nor {@code "."} is read; otherwise {@code path} itself is read.
 *
 * @param path directory containing Avro files, or a single Avro file
 * @param conf configuration used to resolve the file system and open the file
 * @return the schema of the file that was read, or {@code null} when the directory lists no
 *         eligible file
 * @throws IOException if listing or reading fails
 */
public static Schema getAvroSchema(Path path, Configuration conf)
    throws IOException {
  final FileSystem fileSystem = path.getFileSystem(conf);

  Path schemaFile = path;
  if (fileSystem.isDirectory(path)) {
    final PathFilter visibleFiles = new PathFilter() {
      @Override
      public boolean accept(Path candidate) {
        final String fileName = candidate.getName();
        return !(fileName.startsWith("_") || fileName.startsWith("."));
      }
    };
    final FileStatus[] listed = fileSystem.listStatus(path, visibleFiles);
    if (listed.length == 0) {
      return null;
    }
    schemaFile = listed[0].getPath();
  }

  final FileReader<GenericRecord> schemaReader = DataFileReader.openReader(
      new FsInput(schemaFile, conf), new GenericDatumReader<GenericRecord>());
  final Schema fileSchema = schemaReader.getSchema();
  schemaReader.close();
  return fileSchema;
}
Example #24
Source File: AvroUtilities.java From pxf with Apache License 2.0 | 5 votes |
/**
 * Reads the schema stored in the header of an Avro data source.
 *
 * @param configuration configuration used to open the data source
 * @param dataSource    path of the Avro file
 * @return the schema of the file
 * @throws IOException if the file cannot be opened, read, or closed
 */
private static Schema readSchemaFromAvroDataSource(Configuration configuration, String dataSource)
        throws IOException {
    final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>();
    final FsInput sourceInput = new FsInput(new Path(dataSource), configuration);

    final Schema sourceSchema;
    try (DataFileReader<GenericRecord> sourceReader = new DataFileReader<>(sourceInput, recordReader)) {
        sourceSchema = sourceReader.getSchema();
    }
    return sourceSchema;
}
Example #25
Source File: AvroDrillTable.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Creates the table and opens an Avro reader over the first file of the selection.
 *
 * @param storageEngineName name passed through to the superclass
 * @param plugin            file system plugin; supplies the configuration used to open the file
 * @param schemaConfig      schema configuration; its user name is passed to the superclass
 * @param selection         format selection whose first file is opened
 * @throws UserException (data read error) if the file cannot be opened
 */
public AvroDrillTable(String storageEngineName,
                      FileSystemPlugin plugin,
                      SchemaConfig schemaConfig,
                      FormatSelection selection) {
  super(storageEngineName, plugin, schemaConfig.getUserName(), selection);
  this.schemaConfig = schemaConfig;

  final Path firstFile = selection.getAsFiles().get(0);
  try {
    final FsInput firstFileInput = new FsInput(firstFile, plugin.getFsConf());
    this.reader = new DataFileReader<>(firstFileInput, new GenericDatumReader<GenericContainer>());
  } catch (IOException e) {
    throw UserException.dataReadError(e).build(logger);
  }
}
Example #26
Source File: AvroConversionBaseMapper.java From datacollector with Apache License 2.0 | 4 votes |
@Override protected void map(String input, String output, Context context) throws IOException, InterruptedException { FileSystem fs = FileSystem.get(context.getConfiguration()); Configuration conf = context.getConfiguration(); LOG.info("Converting input file: {}", input); LOG.info("Output directory: {}", output); Path inputPath = new Path(input); Path outputDir = new Path(output); fs.mkdirs(outputDir); Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName()); if(fs.exists(tempFile)) { if(conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) { fs.delete(tempFile, true); } else { throw new IOException("Temporary file " + tempFile + " already exists."); } } LOG.info("Using temp file: {}", tempFile); // Output file is the same as input except of dropping .avro extension if it exists and appending .parquet or .orc String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix(); Path finalFile = new Path(outputDir, outputFileName); LOG.info("Final path will be: {}", finalFile); // Avro reader SeekableInput seekableInput = new FsInput(inputPath, conf); DatumReader<GenericRecord> reader = new GenericDatumReader<>(); FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader); Schema avroSchema = fileReader.getSchema(); initializeWriter(tempFile, avroSchema, conf, context); LOG.info("Started reading input file"); long recordCount = 0; try { while (fileReader.hasNext()) { GenericRecord record = fileReader.next(); handleAvroRecord(record); context.getCounter(Counters.PROCESSED_RECORDS).increment(1); recordCount++; } } catch (Exception e) { // Various random stuff can happen while converting, so we wrap the underlying exception with more details String message = String.format( "Exception at offset %d (record %d): %s", fileReader.tell(), recordCount, e.toString() ); throw new IOException(message, e); } LOG.info("Done reading input file"); closeWriter(); LOG.info("Moving 
temporary file {} to final destination {}", tempFile, finalFile); fs.rename(tempFile, finalFile); if(!context.getConfiguration().getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) { LOG.info("Removing input file", inputPath); fs.delete(inputPath, true); } LOG.info("Done converting input file into output directory {}", output); }
Example #27
Source File: GenerateDictionary.java From Cubert with Apache License 2.0 | 4 votes |
/**
 * Loads per-column code dictionaries from an Avro file.
 *
 * <p>Each record supplies a {@code colname}, a {@code colvalue} and an integer {@code code};
 * the value/code pair is added to the dictionary of its column, creating that dictionary on
 * first use.
 *
 * @param path   location of the Avro dictionary file
 * @param isHDFS {@code true} to open {@code path} through {@link FsInput} with {@code conf};
 *               {@code false} to open it as a local {@link File}
 * @param conf   configuration used when {@code isHDFS} is {@code true}
 * @return map from column name to its dictionary
 * @throws IOException if the file cannot be opened or read
 */
public static Map<String, CodeDictionary> loadDictionary(String path,
                                                         boolean isHDFS,
                                                         Configuration conf) throws IOException
{
    Map<String, CodeDictionary> dictionaries = new HashMap<String, CodeDictionary>();
    Schema schema = getSchema();

    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    DataFileReader<GenericRecord> dataFileReader;

    if (isHDFS)
    {
        dataFileReader =
                new DataFileReader<GenericRecord>(new FsInput(new Path(path), conf),
                                                  datumReader);
    }
    else
    {
        dataFileReader = new DataFileReader<GenericRecord>(new File(path), datumReader);
    }

    try
    {
        while (dataFileReader.hasNext())
        {
            GenericRecord record = dataFileReader.next();
            String colName = record.get("colname").toString();
            String colValue = record.get("colvalue").toString();
            int code = (Integer) record.get("code");

            CodeDictionary dict = dictionaries.get(colName);
            if (dict == null)
            {
                dict = new CodeDictionary();
                dictionaries.put(colName, dict);
            }

            dict.addKeyCode(colValue, code);
        }
    }
    finally
    {
        // Close on every path so a malformed record does not leak the file handle.
        dataFileReader.close();
    }

    return dictionaries;
}
Example #28
Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 4 votes |
/**
 * Converts an Avro file to ORC by opening it through {@link FsInput} with this converter's
 * {@code configuration} and delegating to the input-based {@code convert} overload.
 *
 * @param avroInputFile path of the Avro file to read
 * @param orcOutputFile path of the ORC file to write
 * @throws IOException if the input cannot be opened or the delegated conversion fails
 */
public void convert(Path avroInputFile, Path orcOutputFile) throws IOException {
  final FsInput avroInput = new FsInput(avroInputFile, configuration);
  convert(avroInput, orcOutputFile);
}
Example #29
Source File: HdfsReader.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Opens an {@link FsInput} over {@code filePathInHdfs} using the configuration returned by
 * {@code getConfiguration()}.
 *
 * @return a newly opened input; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened
 */
public FsInput getFsInput() throws IOException {
  return new FsInput(new Path(this.filePathInHdfs), getConfiguration());
}
Example #30
Source File: AvroAsTextRecordReaderCopy.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
/**
 * Creates a record reader for one file split by opening the split's file as an Avro container
 * and delegating to the constructor that takes the opened reader and the split.
 *
 * @param job   job configuration used to open the split's file
 * @param split the file split to read
 * @throws IOException if the file cannot be opened as an Avro data file
 */
public AvroAsTextRecordReaderCopy(JobConf job, FileSplit split) throws IOException {
  // The delegation must stay the first statement, so the reader is built inline.
  this(DataFileReader.openReader
       (new FsInput(split.getPath(), job), new GenericDatumReader<T>()), split);
}