org.apache.avro.file.DataFileStream Java Examples
The following examples show how to use
org.apache.avro.file.DataFileStream.
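Most of the examples below share the same basic pattern: wrap an InputStream in a DataFileStream together with a DatumReader, then read the schema, metadata, or records from the stream. As a quick orientation, here is a minimal, self-contained sketch of that pattern (the file name users.avro is a placeholder):

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class DataFileStreamExample {
    public static void main(String[] args) throws IOException {
        // "users.avro" is a placeholder path; any Avro container file works.
        try (InputStream in = new FileInputStream("users.avro");
             DataFileStream<GenericRecord> stream =
                 new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {

            // The writer's schema is stored in the container header.
            Schema schema = stream.getSchema();
            System.out.println("schema: " + schema);

            // Iterate records, reusing one GenericRecord to limit allocations.
            GenericRecord record = null;
            while (stream.hasNext()) {
                record = stream.next(record);
                System.out.println(record);
            }
        }
    }
}

Passing the previous record back into next(record) reuses the object rather than allocating a new one per record, a pattern several of the test examples below rely on.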
Example #1
Source File: AvroStorageUtils.java From Cubert with Apache License 2.0

/**
 * This method is called by {@link #getAvroSchema}. The default implementation
 * returns the schema of an avro file, or the schema of the last file in a
 * first-level directory (one that does not contain sub-directories).
 *
 * @param path path of a file or first-level directory
 * @param fs file system
 * @return avro schema
 * @throws IOException
 */
public static Schema getSchema(Path path, FileSystem fs) throws IOException {
    /* get path of the last file */
    Path lastFile = AvroStorageUtils.getLast(path, fs);
    if (lastFile == null) {
        return null;
    }

    /* read in file and obtain schema */
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    InputStream hdfsInputStream = fs.open(lastFile);
    DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    Schema ret = avroDataStream.getSchema();
    avroDataStream.close();

    return ret;
}
Example #2
Source File: PutHiveStreaming.java From localization_nifi with Apache License 2.0

private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {

    appendFlowFile.set(session.append(appendFlowFile.get(), (out) -> {

        try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord sRecord : records) {
                writer.append(sRecord.getRecord());
            }
            writer.flush();
        }
    }));
}
Example #3
Source File: TestSelectHive3QL.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #4
Source File: TestSplitAvro.java From localization_nifi with Apache License 2.0

@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);
    runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0)
          .assertAttributeEquals(FRAGMENT_COUNT.key(), "100");

    final List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);

    checkDataFileSplitSize(flowFiles, 1, false);

    for (final MockFlowFile flowFile : flowFiles) {
        try (final ByteArrayInputStream in = new ByteArrayInputStream(flowFile.toByteArray());
             final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY1));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY2));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY3));
        }
    }
}
Example #5
Source File: TestSelectHive_1_1QL.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #6
Source File: TestSelectHiveQL.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #7
Source File: TestJdbcCommon.java From nifi with Apache License 2.0

@Test
public void testConvertToAvroStreamForShort() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.TINYINT);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final short s = 25;
    when(rs.getObject(Mockito.anyInt())).thenReturn(s);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Short.toString(s), record.get("t_int").toString());
        }
    }
}
Example #8
Source File: PutHiveStreaming.java From nifi with Apache License 2.0

private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                              DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
Example #9
Source File: SegmentCreationPhaseMapReduceJob.java From incubator-pinot with Apache License 2.0

private LongColumnPreIndexStatsCollector getTimeColumnStatsCollector(Schema schema, File localAvroFile)
    throws FileNotFoundException, IOException {
    String timeColumnName = schema.getTimeColumnName();
    FieldSpec spec = schema.getTimeFieldSpec();
    LOGGER.info("Spec for " + timeColumnName + " is " + spec);
    LongColumnPreIndexStatsCollector timeColumnStatisticsCollector =
        new LongColumnPreIndexStatsCollector(spec.getName(), new StatsCollectorConfig(schema, null));
    LOGGER.info("StatsCollector :" + timeColumnStatisticsCollector);
    DataFileStream<GenericRecord> dataStream =
        new DataFileStream<GenericRecord>(new FileInputStream(localAvroFile), new GenericDatumReader<GenericRecord>());
    while (dataStream.hasNext()) {
        GenericRecord next = dataStream.next();
        timeColumnStatisticsCollector.collect(next.get(timeColumnName));
    }
    dataStream.close();
    timeColumnStatisticsCollector.seal();
    return timeColumnStatisticsCollector;
}
Example #10
Source File: AvroDateRangeMetadata.java From datafu with Apache License 2.0

/**
 * Reads the date range from the metadata stored in an Avro file.
 *
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException IOException
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException {
    path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);

    try {
        return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                             new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
    } finally {
        dataFileStream.close();
        dataInputStream.close();
    }
}
Example #11
Source File: Examples.java From datafu with Apache License 2.0

private Long loadMemberCount(Path path, String timestamp) throws IOException {
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            GenericRecord r = dataFileStream.next();
            Long count = (Long) ((GenericRecord) r.get("value")).get("count");
            Assert.assertNotNull(count);
            System.out.println("found count: " + count);
            return count;
        } finally {
            dataFileStream.close();
        }
    }
    throw new RuntimeException("found no data");
}
Example #12
Source File: HadoopSegmentPreprocessingJob.java From incubator-pinot with Apache License 2.0

/**
 * Finds the avro file in the input folder, and returns its avro schema.
 *
 * @param inputPathDir Path to input directory
 * @return Input schema
 * @throws IOException if the directory cannot be listed or the file cannot be read
 */
private Schema getSchema(Path inputPathDir) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    Schema avroSchema = null;
    for (FileStatus fileStatus : fs.listStatus(inputPathDir)) {
        if (fileStatus.isFile() && fileStatus.getPath().getName().endsWith(".avro")) {
            _logger.info("Extracting schema from " + fileStatus.getPath());
            // Open the reader on the avro file itself (the listing passed the
            // directory path here, which would fail to parse as an Avro container).
            try (DataFileStream<GenericRecord> dataStreamReader = getAvroReader(fileStatus.getPath())) {
                avroSchema = dataStreamReader.getSchema();
            }
            break;
        }
    }
    return avroSchema;
}
Example #13
Source File: PentahoAvroInputFormat.java From pentaho-hadoop-shims with Apache License 2.0

private DataFileStream<Object> createNestedDataFileStream() throws Exception {
    DatumReader<Object> datumReader;
    if ( useFieldAsInputStream ) {
        datumReader = new GenericDatumReader<Object>();
        inputStream.reset();
        return new DataFileStream<Object>( inputStream, datumReader );
    }
    if ( schemaFileName != null && schemaFileName.length() > 0 ) {
        Schema schema = new Schema.Parser().parse( KettleVFS.getInputStream( schemaFileName, variableSpace ) );
        datumReader = new GenericDatumReader<Object>( schema );
    } else {
        datumReader = new GenericDatumReader<Object>();
    }
    FileObject fileObject = KettleVFS.getFileObject( fileName, variableSpace );
    if ( fileObject.isFile() ) {
        this.inputStream = fileObject.getContent().getInputStream();
        return new DataFileStream<>( inputStream, datumReader );
    } else {
        FileObject[] avroFiles = fileObject.findFiles( new FileExtensionSelector( "avro" ) );
        if ( !Utils.isEmpty( avroFiles ) ) {
            this.inputStream = avroFiles[ 0 ].getContent().getInputStream();
            return new DataFileStream<>( inputStream, datumReader );
        }
        return null;
    }
}
Example #14
Source File: TestJdbcCommonConvertToAvro.java From nifi with Apache License 2.0

@Test
public void testConvertToAvroStreamForNumbers() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(testParams.sqlType);
    when(metadata.isSigned(1)).thenReturn(testParams.signed);
    when(metadata.getPrecision(1)).thenReturn(testParams.precision);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final int ret = 0;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Integer.toString(ret), record.get("t_int").toString());
        }
    }
}
Example #15
Source File: QueryDatabaseTableTest.java From nifi with Apache License 2.0

private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }
        return recordsFromStream;
    }
}
Example #16
Source File: TestHDFSCompressedDataStream.java From mt-flume with Apache License 2.0

@Test
public void testGzipDurabilityWithSerializer() throws Exception {
    Context context = new Context();
    context.put("serializer", "AVRO_EVENT");

    HDFSCompressedDataStream writer = new HDFSCompressedDataStream();
    writer.configure(context);

    writer.open(fileURI, factory.getCodec(new Path(fileURI)),
                SequenceFile.CompressionType.BLOCK);

    String[] bodies = { "yarf!", "yarfing!" };
    writeBodies(writer, bodies);

    int found = 0;
    int expected = bodies.length;
    List<String> expectedBodies = Lists.newArrayList(bodies);

    GZIPInputStream cmpIn = new GZIPInputStream(new FileInputStream(file));
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> avroStream =
        new DataFileStream<GenericRecord>(cmpIn, reader);
    GenericRecord record = new GenericData.Record(avroStream.getSchema());
    while (avroStream.hasNext()) {
        avroStream.next(record);
        CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
        String bodyStr = decoder.decode((ByteBuffer) record.get("body")).toString();
        expectedBodies.remove(bodyStr);
        found++;
    }
    avroStream.close();
    cmpIn.close();

    Assert.assertTrue("Found = " + found + ", Expected = " + expected
        + ", Left = " + expectedBodies.size() + " " + expectedBodies,
        expectedBodies.size() == 0);
}
Example #17
Source File: AvroStockFileRead.java From hiped2 with Apache License 2.0

public static void dumpStream(InputStream is) throws IOException {
    DataFileStream<Stock> reader =
        new DataFileStream<Stock>(is, new SpecificDatumReader<Stock>(Stock.class));

    for (Stock a : reader) {
        System.out.println(ToStringBuilder.reflectionToString(a, ToStringStyle.SIMPLE_STYLE));
    }

    IOUtils.closeStream(is);
    IOUtils.closeStream(reader);
}
Example #18
Source File: AvroUtils.java From incubator-pinot with Apache License 2.0

/**
 * Given an Avro data file, map from column to field type and time unit, return the equivalent Pinot schema.
 *
 * @param avroDataFile Avro data file
 * @param fieldTypeMap Map from column to field type
 * @param timeUnit Time unit
 * @return Pinot schema
 */
public static Schema getPinotSchemaFromAvroDataFile(File avroDataFile,
    @Nullable Map<String, FieldSpec.FieldType> fieldTypeMap, @Nullable TimeUnit timeUnit)
    throws IOException {
    try (DataFileStream<GenericRecord> reader = getAvroReader(avroDataFile)) {
        org.apache.avro.Schema avroSchema = reader.getSchema();
        return getPinotSchemaFromAvroSchema(avroSchema, fieldTypeMap, timeUnit);
    }
}
Example #19
Source File: PartitionPreservingJoinTests.java From datafu with Apache License 2.0

private HashMap<Long, ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, ImpressionClick> counts = new HashMap<Long, ImpressionClick>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Integer impressions = (Integer) ((GenericRecord) r.get("value")).get("impressions");
                Integer clicks = (Integer) ((GenericRecord) r.get("value")).get("clicks");
                Assert.assertFalse(counts.containsKey(memberId));
                ImpressionClick data = new ImpressionClick();
                data.clicks = clicks;
                data.impressions = impressions;
                counts.put(memberId, data);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example #20
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0

/**
 * Get the Avro file reader for the given file.
 */
public static DataFileStream<GenericRecord> getAvroReader(File avroFile) throws IOException {
    if (avroFile.getName().endsWith(".gz")) {
        return new DataFileStream<>(new GZIPInputStream(new FileInputStream(avroFile)), new GenericDatumReader<>());
    } else {
        return new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<>());
    }
}
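Since the returned stream holds an open file handle, a caller would typically consume it in try-with-resources; a hypothetical call site (the file name is a placeholder):

try (DataFileStream<GenericRecord> reader = ParquetUtils.getAvroReader(new File("data.avro.gz"))) {
    while (reader.hasNext()) {
        GenericRecord record = reader.next();
        // process the record
    }
}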
Example #21
Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0

private HashMap<Long, Long> loadOutputCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
        try {
            while (dataFileStream.hasNext()) {
                GenericRecord r = dataFileStream.next();
                Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                Assert.assertFalse(counts.containsKey(memberId));
                counts.put(memberId, count);
            }
        } finally {
            dataFileStream.close();
        }
    }
    return counts;
}
Example #22
Source File: Schemas.java From parquet-mr with Apache License 2.0

public static Schema fromAvro(InputStream in) throws IOException {
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> stream = null;
    boolean threw = true;
    try {
        stream = new DataFileStream<>(in, datumReader);
        Schema schema = stream.getSchema();
        threw = false;
        return schema;
    } finally {
        Closeables.close(stream, threw);
    }
}
Example #23
Source File: AvroUtils.java From ml-ease with Apache License 2.0

/**
 * Loads the schema from an Avro data file.
 *
 * @param conf The JobConf.
 * @param path The path to the data file.
 * @return The schema read from the data file's metadata.
 * @throws IOException
 */
public static Schema getSchemaFromFile(JobConf conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(new Configuration());
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    return dataFileStream.getSchema();
}
Example #24
Source File: AvroDump.java From hiped2 with Apache License 2.0

public static void readFromAvro(InputStream is) throws IOException {
    DataFileStream<Object> reader =
        new DataFileStream<Object>(is, new GenericDatumReader<Object>());

    for (Object o : reader) {
        System.out.println(o);
    }

    IOUtils.closeStream(is);
    IOUtils.closeStream(reader);
}
Example #25
Source File: TestJdbcCommon.java From nifi with Apache License 2.0

@Test
public void testConvertToAvroStreamForUnsignedIntegerWithPrecision10() throws SQLException, IOException {
    final String mockColumnName = "t_int";
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.INTEGER);
    when(metadata.isSigned(1)).thenReturn(false);
    when(metadata.getPrecision(1)).thenReturn(10);
    when(metadata.getColumnName(1)).thenReturn(mockColumnName);
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final Long ret = 0L;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Long.toString(ret), record.get(mockColumnName).toString());
        }
    }
}
Example #26
Source File: GeoWaveAvroIngestTest.java From geowave with Apache License 2.0

private boolean validate(final URL file) {
    try (DataFileStream<AvroSimpleFeatureCollection> ds = new DataFileStream<>(
            file.openStream(),
            new SpecificDatumReader<AvroSimpleFeatureCollection>(
                AvroSimpleFeatureCollection.getClassSchema()))) {
        if (ds.getHeader() != null) {
            return true;
        }
    } catch (final IOException e) {
        // Do nothing for now
    }
    return false;
}
Example #27
Source File: AvroFileReader.java From ml-ease with Apache License 2.0

public <T> void build(String filePath, AvroConsumer<T> builder) throws IOException {
    List<Path> paths = getPaths(filePath);
    for (Path path : paths) {
        DataFileStream<Object> stream = null;
        try {
            stream = getAvroDataStream(path);
            while (stream.hasNext()) {
                builder.consume(stream.next());
            }
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
    builder.done();
}
Example #28
Source File: RegressionAdmmTrain.java From ml-ease with Apache License 2.0

private void computeU(JobConf conf, String uPath, String uplusxPath, Map<String, LinearModel> z)
    throws IOException {
    AvroHdfsFileWriter<GenericRecord> writer =
        new AvroHdfsFileWriter<GenericRecord>(conf, uPath, LinearModelAvro.SCHEMA$);
    DataFileWriter<GenericRecord> recordwriter = writer.get();

    // read u+x
    for (Path path : Util.findPartFiles(conf, new Path(uplusxPath))) {
        DataFileStream<Object> stream = AvroUtils.getAvroDataStream(conf, path);
        while (stream.hasNext()) {
            GenericData.Record record = (GenericData.Record) stream.next();
            String partitionID = Util.getStringAvro(record, "key", false);
            if (record.get("uplusx") != null) {
                String lambda = Util.getLambda(partitionID);
                LinearModel newu =
                    new LinearModel(LibLinearDataset.INTERCEPT_NAME, (List<?>) record.get("uplusx"));
                newu.linearCombine(1.0, -1.0, z.get(lambda));
                GenericData.Record newvaluemap = new GenericData.Record(LinearModelAvro.SCHEMA$);
                List modellist = newu.toAvro(LibLinearDataset.INTERCEPT_NAME);
                newvaluemap.put("key", partitionID);
                newvaluemap.put("model", modellist);
                recordwriter.append(newvaluemap);
            }
        }
    }
    recordwriter.close();
}
Example #29
Source File: AvroStockAvgFileRead.java From hiped2 with Apache License 2.0

public static void readFromAvro(InputStream is) throws IOException {
    DataFileStream<StockAvg> reader =
        new DataFileStream<StockAvg>(is, new SpecificDatumReader<StockAvg>(StockAvg.class));

    for (StockAvg a : reader) {
        System.out.println(ToStringBuilder.reflectionToString(a, ToStringStyle.SHORT_PREFIX_STYLE));
    }

    IOUtils.closeStream(is);
    IOUtils.closeStream(reader);
}