org.apache.avro.file.DataFileStream#next

Source File: Examples.java From datafu with Apache License 2.0

6 votes

private Long loadMemberCount(Path path, String timestamp) throws IOException
{
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      GenericRecord r = dataFileStream.next();
      Long count = (Long)((GenericRecord)r.get("value")).get("count");   
      Assert.assertNotNull(count);       
      System.out.println("found count: " + count);
      return count;
    }
    finally
    {
      dataFileStream.close();
    }
  }
  throw new RuntimeException("found no data");
}

Source File: TestAvroStorage.java From spork with Apache License 2.0

5 votes

private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration()) ;

    /* read in expected results*/
    Set<GenericData.Record> expected = getExpected (expectedOutpath);

    /* read in output results and compare */
    Path output = new Path(outPath);
    assertTrue("Output dir does not exists!", fs.exists(output)
            && fs.getFileStatus(output).isDir());

    Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
    assertTrue("Split field dirs not found!", paths != null);

    for (Path path : paths) {
      Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
      assertTrue("No files found for path: " + path.toUri().getPath(),
              files != null);
      for (Path filePath : files) {
        assertTrue("This shouldn't be a directory", fs.isFile(filePath));

        GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

        DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(
                                        fs.open(filePath), reader);
        assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
        int count = 0;
        while (in.hasNext()) {
            GenericData.Record obj = in.next();
            assertTrue("Avro result object found that's not expected: Found "
                    + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString()
                    + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n"
                    , expected.contains(obj));
            count++;
        }
        in.close();
        assertEquals(expected.size(), count);
      }
    }
}

Source File: PartitionCollapsingJoinTest.java From datafu with Apache License 2.0

5 votes

private HashMap<Long,ImpressionClick> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,ImpressionClick> counts = new HashMap<Long,ImpressionClick>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Integer impressions = (Integer)((GenericRecord)r.get("value")).get("impressions");    
        Integer clicks = (Integer)((GenericRecord)r.get("value")).get("clicks");         
        Assert.assertFalse(counts.containsKey(memberId));
        ImpressionClick data = new ImpressionClick();
        data.clicks = clicks;
        data.impressions = impressions;
        counts.put(memberId, data);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}

Source File: PartitionCollapsingTests.java From datafu with Apache License 2.0

5 votes

private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}

Source File: TestAvroJob.java From datafu with Apache License 2.0

5 votes

private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)r.get("id");
        Long count = (Long)r.get("count");   
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}

Source File: TestHDFSCompressedDataStream.java From mt-flume with Apache License 2.0

5 votes

@Test
public void testGzipDurabilityWithSerializer() throws Exception {
  Context context = new Context();
  context.put("serializer", "AVRO_EVENT");

  HDFSCompressedDataStream writer = new HDFSCompressedDataStream();
  writer.configure(context);

  writer.open(fileURI, factory.getCodec(new Path(fileURI)),
      SequenceFile.CompressionType.BLOCK);

  String[] bodies = { "yarf!", "yarfing!" };
  writeBodies(writer, bodies);

  int found = 0;
  int expected = bodies.length;
  List<String> expectedBodies = Lists.newArrayList(bodies);

  GZIPInputStream cmpIn = new GZIPInputStream(new FileInputStream(file));
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> avroStream =
      new DataFileStream<GenericRecord>(cmpIn, reader);
  GenericRecord record = new GenericData.Record(avroStream.getSchema());
  while (avroStream.hasNext()) {
    avroStream.next(record);
    CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
    String bodyStr = decoder.decode((ByteBuffer) record.get("body"))
        .toString();
    expectedBodies.remove(bodyStr);
    found++;
  }
  avroStream.close();
  cmpIn.close();

  Assert.assertTrue("Found = " + found + ", Expected = " + expected
      + ", Left = " + expectedBodies.size() + " " + expectedBodies,
      expectedBodies.size() == 0);
}

Source File: Examples.java From datafu with Apache License 2.0

5 votes

private HashMap<Long,Integer> loadOutputCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Integer> counts = new HashMap<Long,Integer>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {          
        GenericRecord r = dataFileStream.next();
        _log.info("found: " + r.toString());
        Long memberId = (Long)((GenericRecord)r.get("key")).get("member_id");
        Assert.assertNotNull(memberId);
        Integer count = (Integer)((GenericRecord)r.get("value")).get("count");   
        Assert.assertNotNull(count);     
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}

Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0

5 votes

private HashMap<Long,Long> loadOutputCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}

Source File: PartitionPreservingCollapsingIntegrationTests.java From datafu with Apache License 2.0

5 votes

private HashMap<Long,Long> loadIntermediateCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  String nestedPath = getNestedPathFromTimestamp(timestamp);
  Assert.assertTrue(fs.exists(new Path(_intermediatePath, nestedPath)));
  for (FileStatus stat : fs.globStatus(new Path(_intermediatePath,nestedPath + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}

Source File: TestAvroStorage.java From spork with Apache License 2.0

5 votes

private Set<GenericData.Record> getExpected (String pathstr ) throws IOException {

        Set<GenericData.Record> ret = new TreeSet<GenericData.Record>(
                new Comparator<GenericData.Record>() {
                    @Override
                    public int compare(Record o1, Record o2) {
                        return o1.toString().compareTo(o2.toString());
                    }}
                );
        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in output results and compare */
        Path output = new Path(pathstr);
        assertTrue("Expected output does not exists!", fs.exists(output));

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

                DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath), reader);

                while (in.hasNext()) {
                    GenericData.Record obj = in.next();
                    ret.add(obj);
                }
                in.close();
            }
        }
        return ret;
    }

Source File: TestWriteAvroResultWithSchema.java From nifi with Apache License 2.0

5 votes

@Override
protected GenericRecord readRecord(final InputStream in, final Schema schema) throws IOException {
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(in, new GenericDatumReader<>());
    final Schema avroSchema = dataFileStream.getSchema();
    GenericData.setStringType(avroSchema, StringType.String);
    final GenericRecord avroRecord = dataFileStream.next();

    return avroRecord;
}

Source File: TestHDFSEventSink.java From mt-flume with Apache License 2.0

5 votes

private void verifyOutputAvroFiles(FileSystem fs, Configuration conf, String dir, String prefix, List<String> bodies) throws IOException {
  int found = 0;
  int expected = bodies.size();
  for(String outputFile : getAllFiles(dir)) {
    String name = (new File(outputFile)).getName();
    if(name.startsWith(prefix)) {
      FSDataInputStream input = fs.open(new Path(outputFile));
      DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
      DataFileStream<GenericRecord> avroStream =
          new DataFileStream<GenericRecord>(input, reader);
      GenericRecord record = new GenericData.Record(avroStream.getSchema());
      while (avroStream.hasNext()) {
        avroStream.next(record);
        ByteBuffer body = (ByteBuffer) record.get("body");
        CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
        String bodyStr = decoder.decode(body).toString();
        LOG.debug("Removing event: {}", bodyStr);
        bodies.remove(bodyStr);
        found++;
      }
      avroStream.close();
      input.close();
    }
  }
  Assert.assertTrue("Found = " + found + ", Expected = "  +
      expected + ", Left = " + bodies.size() + " " + bodies,
        bodies.size() == 0);
}

Source File: TestExecuteSQLRecord.java From nifi with Apache License 2.0

4 votes

@Test
public void testWriteLOBsToAvro() throws Exception {
    final DBCPService dbcp = new DBCPServiceSimpleImpl("h2");
    final Map<String, String> dbcpProperties = new HashMap<>();

    runner = TestRunners.newTestRunner(ExecuteSQLRecord.class);
    runner.addControllerService("dbcp", dbcp, dbcpProperties);
    runner.enableControllerService(dbcp);
    runner.setProperty(AbstractExecuteSQL.DBCP_SERVICE, "dbcp");

    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();

    // load test data to database
    final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
    Statement stmt = con.createStatement();

    try {
        stmt.execute("drop table TEST_NULL_INT");
    } catch (final SQLException sqle) {
    }

    stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, image blob(1K), words clob(1K), "
            + "natwords nclob(1K), constraint my_pk primary key (id))");
    stmt.execute("insert into TEST_NULL_INT (id, val1, val2, image, words, natwords) VALUES (0, NULL, 1, CAST (X'DEADBEEF' AS BLOB), "
            + "CAST ('Hello World' AS CLOB), CAST ('I am an NCLOB' AS NCLOB))");

    runner.setIncomingConnection(false);
    runner.setProperty(AbstractExecuteSQL.SQL_SELECT_QUERY, "select * from TEST_NULL_INT");
    AvroRecordSetWriter recordWriter = new AvroRecordSetWriter();
    runner.addControllerService("writer", recordWriter);
    runner.setProperty(recordWriter, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.INHERIT_RECORD_SCHEMA);
    runner.setProperty(ExecuteSQLRecord.RECORD_WRITER_FACTORY, "writer");
    runner.enableControllerService(recordWriter);
    runner.run();

    runner.assertAllFlowFilesTransferred(AbstractExecuteSQL.REL_SUCCESS, 1);
    MockFlowFile flowFile = runner.getFlowFilesForRelationship(AbstractExecuteSQL.REL_SUCCESS).get(0);
    flowFile.assertAttributeEquals(AbstractExecuteSQL.RESULT_ROW_COUNT, "1");

    ByteArrayInputStream bais = new ByteArrayInputStream(flowFile.toByteArray());
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(bais, new GenericDatumReader<>());
    final Schema avroSchema = dataFileStream.getSchema();
    GenericData.setStringType(avroSchema, GenericData.StringType.String);
    final GenericRecord avroRecord = dataFileStream.next();

    Object imageObj = avroRecord.get("IMAGE");
    assertNotNull(imageObj);
    assertTrue(imageObj instanceof ByteBuffer);
    assertArrayEquals(new byte[]{(byte) 0xDE, (byte) 0xAD, (byte) 0xBE, (byte) 0xEF}, ((ByteBuffer) imageObj).array());

    Object wordsObj = avroRecord.get("WORDS");
    assertNotNull(wordsObj);
    assertTrue(wordsObj instanceof Utf8);
    assertEquals("Hello World", wordsObj.toString());

    Object natwordsObj = avroRecord.get("NATWORDS");
    assertNotNull(natwordsObj);
    assertTrue(natwordsObj instanceof Utf8);
    assertEquals("I am an NCLOB", natwordsObj.toString());
}

Source File: BucketingSinkTest.java From flink with Apache License 2.0

4 votes

/**
 * This tests user defined hdfs configuration.
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
	final String outPath = hdfsURI + "/string-non-rolling-with-config";
	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	Configuration conf = new Configuration();
	conf.set("io.file.buffer.size", "40960");

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setFSConfig(conf)
		.setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}

Source File: BucketingSinkTest.java From flink with Apache License 2.0

4 votes

/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
	final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}

Source File: WholeFileTransformerProcessor.java From datacollector with Apache License 2.0

4 votes

/**
 * Convert Avro record to Parquet
 * @param sourceFileName the source Avro file name
 * @param fileReader the {@link org.apache.avro.file.DataFileStream} Avro file reader
 * @param tempParquetFile the {@link java.nio.file.Path} temporary parquet file path
 */
private void writeParquet(String sourceFileName, DataFileStream<GenericRecord> fileReader, Path tempParquetFile) throws StageException {
  long recordCount = 0;
  GenericRecord avroRecord;
  Schema schema = fileReader.getSchema();

  LOG.debug("Start reading input file : {}", sourceFileName);
  try {
    // initialize parquet writer
    Configuration jobConfiguration = new Configuration();
    String compressionCodecName = compressionElEval.eval(variables, jobConfig.avroParquetConfig.compressionCodec, String.class);
    jobConfiguration.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, compressionCodecName);
    jobConfiguration.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize);
    jobConfiguration.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize);
    jobConfiguration.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, jobConfig.avroParquetConfig.dictionaryPageSize);
    jobConfiguration.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize);

    // Parquet writer
    ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(
        new org.apache.hadoop.fs.Path(tempParquetFile.toString()),
        schema,
        jobConfiguration
    );
    parquetWriter = builder.build();

    while (fileReader.hasNext()) {
      avroRecord = fileReader.next();
      parquetWriter.write(avroRecord);
      recordCount++;
    }
    parquetWriter.close();

  } catch (IOException ex) {
    throw new TransformerStageCheckedException(
        Errors.CONVERT_08,
        sourceFileName,
        recordCount,
        ex
    );
  }
  LOG.debug("Finished writing {} records to {}", recordCount, tempParquetFile.getFileName());
}

Source File: TestPutHiveStreaming.java From localization_nifi with Apache License 2.0

4 votes

@Test
public void onTriggerMultipleRecords() throws Exception {
    runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
    runner.setProperty(PutHiveStreaming.DB_NAME, "default");
    runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
    runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "2");
    runner.setValidateExpressionUsage(false);
    Map<String, Object> user1 = new HashMap<String, Object>() {
        {
            put("name", "Joe");
            put("favorite_number", 146);
        }
    };
    Map<String, Object> user2 = new HashMap<String, Object>() {
        {
            put("name", "Mary");
            put("favorite_number", 42);
        }
    };
    Map<String, Object> user3 = new HashMap<String, Object>() {
        {
            put("name", "Matt");
            put("favorite_number", 3);
        }
    };
    runner.enqueue(createAvroRecord(Arrays.asList(user1, user2, user3)));
    runner.run();

    runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 1);
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(PutHiveStreaming.REL_SUCCESS).get(0);
    assertNotNull(resultFlowFile);
    assertEquals("3", resultFlowFile.getAttribute(PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR));
    final DataFileStream<GenericRecord> reader = new DataFileStream<>(
            new ByteArrayInputStream(resultFlowFile.toByteArray()),
            new GenericDatumReader<GenericRecord>());

    Schema schema = reader.getSchema();

    // Verify that the schema is preserved
    assertTrue(schema.equals(new Schema.Parser().parse(new File("src/test/resources/user.avsc"))));

    // Verify the records are intact. We can't guarantee order so check the total number and non-null fields
    assertTrue(reader.hasNext());
    GenericRecord record = reader.next(null);
    assertNotNull(record.get("name"));
    assertNotNull(record.get("favorite_number"));
    assertNull(record.get("favorite_color"));
    assertNull(record.get("scale"));
    assertTrue(reader.hasNext());
    record = reader.next(record);
    assertTrue(reader.hasNext());
    reader.next(record);
    assertFalse(reader.hasNext());
}

Source File: BucketingSinkTest.java From flink with Apache License 2.0

4 votes

/**
 * This tests user defined hdfs configuration.
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
	final String outPath = hdfsURI + "/string-non-rolling-with-config";
	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	Configuration conf = new Configuration();
	conf.set("io.file.buffer.size", "40960");

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setFSConfig(conf)
		.setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}

Source File: BucketingSinkTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * This tests user defined hdfs configuration.
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
	final String outPath = hdfsURI + "/string-non-rolling-with-config";
	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	Configuration conf = new Configuration();
	conf.set("io.file.buffer.size", "40960");

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setFSConfig(conf)
		.setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}

Source File: BucketingSinkTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
	final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}

Java Code Examples for org.apache.avro.file.DataFileStream#next()