Java Code Examples for org.apache.hadoop.fs.FSDataOutputStream#flush()
The following examples show how to use org.apache.hadoop.fs.FSDataOutputStream#flush().
You can go to the original project or source file by following the links above each example.
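Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: create a stream, write some bytes, flush, and close. The class name, path, and payload below are placeholders and do not come from any of the projects listed; only the Hadoop API calls (FileSystem.get, create, writeBytes, flush, hflush/hsync, close) are real. Note that on HDFS, flush() by itself does not guarantee that the data is durable or visible to other readers; hflush() and hsync() make the stronger promises.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FlushExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Resolves to the default file system configured in core-site.xml
    // (the local file system if nothing else is configured).
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("/tmp/flush-example.txt"); // hypothetical path

    FSDataOutputStream out = fs.create(path, true /* overwrite */);
    try {
      out.writeBytes("hello, flush\n");
      // Push buffered bytes toward the underlying stream. On HDFS this alone
      // does not guarantee durability or visibility to readers;
      // out.hflush() / out.hsync() provide those guarantees.
      out.flush();
    } finally {
      out.close();
    }
  }
}

The examples below follow the same shape, differing mainly in which FileSystem implementation backs the stream (HDFS, S3, WASB, an in-memory test file system) and in what they assert after the flush.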
Example 1
Source File: HBaseFsck.java From hbase with Apache License 2.0
@Override
public FSDataOutputStream call() throws IOException {
  try {
    FileSystem fs = CommonFSUtils.getCurrentFileSystem(this.conf);
    FsPermission defaultPerms =
        CommonFSUtils.getFilePermissions(fs, this.conf, HConstants.DATA_FILE_UMASK_KEY);
    Path tmpDir = getTmpDir(conf);
    this.hbckLockPath = new Path(tmpDir, HBCK_LOCK_FILE);
    fs.mkdirs(tmpDir);
    final FSDataOutputStream out = createFileWithRetries(fs, this.hbckLockPath, defaultPerms);
    out.writeBytes(InetAddress.getLocalHost().toString());
    // Add a note into the file we write on why hbase2 is writing out an hbck1 lock file.
    out.writeBytes(" Written by an hbase-2.x Master to block an "
        + "attempt by an hbase-1.x HBCK tool making modification to state. "
        + "See 'HBCK must match HBase server version' in the hbase refguide.");
    out.flush();
    return out;
  } catch (RemoteException e) {
    if (AlreadyBeingCreatedException.class.getName().equals(e.getClassName())) {
      return null;
    } else {
      throw e;
    }
  }
}
Example 2
Source File: TestS3NInMemoryFileSystem.java From big-c with Apache License 2.0
public void testBasicReadWriteIO() throws IOException {
  FSDataOutputStream writeData = fs.create(new Path(TEST_PATH));
  writeData.write(TEST_DATA.getBytes());
  writeData.flush();
  writeData.close();

  FSDataInputStream readData = fs.open(new Path(TEST_PATH));
  BufferedReader br = new BufferedReader(new InputStreamReader(readData));
  String line = "";
  StringBuffer stringBuffer = new StringBuffer();
  while ((line = br.readLine()) != null) {
    stringBuffer.append(line);
  }
  br.close();

  assert (TEST_DATA.equals(stringBuffer.toString()));
}
Example 3
Source File: TestS3InMemoryFileSystem.java From hadoop with Apache License 2.0
public void testBasicReadWriteIO() throws IOException {
  FSDataOutputStream writeStream = fs.create(new Path(TEST_PATH));
  writeStream.write(TEST_DATA.getBytes());
  writeStream.flush();
  writeStream.close();

  FSDataInputStream readStream = fs.open(new Path(TEST_PATH));
  BufferedReader br = new BufferedReader(new InputStreamReader(readStream));
  String line = "";
  StringBuffer stringBuffer = new StringBuffer();
  while ((line = br.readLine()) != null) {
    stringBuffer.append(line);
  }
  br.close();

  assert (TEST_DATA.equals(stringBuffer.toString()));
}
Example 4
Source File: TestStochasticLoadBalancerHeterogeneousCostRules.java From hbase with Apache License 2.0
@Test
public void testLoadingFomHDFS() throws Exception {
  HTU.startMiniDFSCluster(3);
  try {
    MiniDFSCluster cluster = HTU.getDFSCluster();
    DistributedFileSystem fs = cluster.getFileSystem();

    // Writing file
    Path path = new Path(fs.getHomeDirectory(), DEFAULT_RULES_FILE_NAME);
    FSDataOutputStream stream = fs.create(path);
    stream.write("server1 10".getBytes());
    stream.flush();
    stream.close();

    Configuration configuration = HTU.getConfiguration();

    // start costFunction
    configuration.set(
        HeterogeneousRegionCountCostFunction.HBASE_MASTER_BALANCER_HETEROGENEOUS_RULES_FILE,
        path.toString());
    this.costFunction = new HeterogeneousRegionCountCostFunction(configuration);
    this.costFunction.loadRules();
    Assert.assertEquals(1, this.costFunction.getNumberOfRulesLoaded());
  } finally {
    HTU.shutdownMiniCluster();
  }
}
Example 5
Source File: TestS3InMemoryFileSystem.java From big-c with Apache License 2.0
public void testBasicReadWriteIO() throws IOException {
  FSDataOutputStream writeStream = fs.create(new Path(TEST_PATH));
  writeStream.write(TEST_DATA.getBytes());
  writeStream.flush();
  writeStream.close();

  FSDataInputStream readStream = fs.open(new Path(TEST_PATH));
  BufferedReader br = new BufferedReader(new InputStreamReader(readStream));
  String line = "";
  StringBuffer stringBuffer = new StringBuffer();
  while ((line = br.readLine()) != null) {
    stringBuffer.append(line);
  }
  br.close();

  assert (TEST_DATA.equals(stringBuffer.toString()));
}
Example 6
Source File: TestBlockUnderConstruction.java From hadoop with Apache License 2.0
void writeFile(Path file, FSDataOutputStream stm, int size) throws IOException {
  long blocksBefore = stm.getPos() / BLOCK_SIZE;

  TestFileCreation.writeFile(stm, BLOCK_SIZE);
  // need to make sure the full block is completely flushed to the DataNodes
  // (see FSOutputSummer#flush)
  stm.flush();
  int blocksAfter = 0;
  // wait until the block is allocated by DataStreamer
  BlockLocation[] locatedBlocks;
  while (blocksAfter <= blocksBefore) {
    locatedBlocks = DFSClientAdapter.getDFSClient(hdfs).getBlockLocations(
        file.toString(), 0L, BLOCK_SIZE * NUM_BLOCKS);
    blocksAfter = locatedBlocks == null ? 0 : locatedBlocks.length;
  }
}
Example 7
Source File: TestHDFSIntegration.java From incubator-sentry with Apache License 2.0
private void loadDataTwoCols(Statement stmt) throws IOException, SQLException {
  FSDataOutputStream f1 = miniDFS.getFileSystem().create(new Path("/tmp/f2.txt"));
  f1.writeChars("m1d1_t1, m1d1_t2\n");
  f1.writeChars("m1d1_t2, m1d1_t2\n");
  f1.writeChars("m1d1_t3, m1d1_t2\n");
  f1.flush();
  f1.close();

  stmt.execute("load data inpath \'/tmp/f2.txt\' overwrite into table p1 partition (month=1, day=1)");
  ResultSet rs = stmt.executeQuery("select * from p1");
  List<String> vals = new ArrayList<String>();
  while (rs.next()) {
    vals.add(rs.getString(1));
  }
  Assert.assertEquals(3, vals.size());
  rs.close();
}
Example 8
Source File: TestInLineFileSystem.java From hudi with Apache License 2.0
private OuterPathInfo generateOuterFileAndGetInfo(int inlineContentSize) throws IOException {
  OuterPathInfo toReturn = new OuterPathInfo();
  Path outerPath = getRandomOuterFSPath();
  listOfGeneratedPaths.add(outerPath);
  toReturn.outerPath = outerPath;
  FSDataOutputStream wrappedOut = outerPath.getFileSystem(conf).create(outerPath, true);

  // append random bytes
  byte[] randomBytes = new byte[RANDOM.nextInt(1000)];
  RANDOM.nextBytes(randomBytes);
  wrappedOut.write(randomBytes);
  toReturn.startOffset = wrappedOut.getPos();

  // add inline content
  byte[] embeddedInlineBytes = new byte[inlineContentSize];
  RANDOM.nextBytes(embeddedInlineBytes);
  wrappedOut.write(embeddedInlineBytes);
  toReturn.expectedBytes = embeddedInlineBytes;
  toReturn.length = embeddedInlineBytes.length;

  // suffix random bytes
  randomBytes = new byte[RANDOM.nextInt(1000)];
  RANDOM.nextBytes(randomBytes);
  wrappedOut.write(randomBytes);

  wrappedOut.flush();
  wrappedOut.close();
  return toReturn;
}
Example 9
Source File: TestBlockToken.java From big-c with Apache License 2.0
/**
 * This test writes a file and gets the block locations without closing the
 * file, and tests the block token in the last block. Block token is verified
 * by ensuring it is of correct kind.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testBlockTokenInLastLocatedBlock() throws IOException,
    InterruptedException {
  Configuration conf = new HdfsConfiguration();
  conf.setBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true);
  conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 512);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(1).build();
  cluster.waitActive();

  try {
    FileSystem fs = cluster.getFileSystem();
    String fileName = "/testBlockTokenInLastLocatedBlock";
    Path filePath = new Path(fileName);
    FSDataOutputStream out = fs.create(filePath, (short) 1);
    out.write(new byte[1000]);
    // ensure that the first block is written out (see FSOutputSummer#flush)
    out.flush();
    LocatedBlocks locatedBlocks = cluster.getNameNodeRpc().getBlockLocations(
        fileName, 0, 1000);
    while (locatedBlocks.getLastLocatedBlock() == null) {
      Thread.sleep(100);
      locatedBlocks = cluster.getNameNodeRpc().getBlockLocations(fileName, 0, 1000);
    }
    Token<BlockTokenIdentifier> token = locatedBlocks.getLastLocatedBlock()
        .getBlockToken();
    Assert.assertEquals(BlockTokenIdentifier.KIND_NAME, token.getKind());
    out.close();
  } finally {
    cluster.shutdown();
  }
}
Example 10
Source File: HdfsIOBenchmark.java From incubator-crail with Apache License 2.0
public void writeSequentialHeap() throws Exception {
  System.out.println("writing sequential file in heap mode " + path);

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FSDataOutputStream instream = fs.create(path);
  byte[] buf = new byte[size];
  double sumbytes = 0;
  double ops = 0;
  System.out.println("read size " + size);
  System.out.println("operations " + loop);

  long start = System.currentTimeMillis();
  while (ops < loop) {
    // System.out.println("writing data, len " + buf.length);
    instream.write(buf, 0, buf.length);
    sumbytes = sumbytes + buf.length;
    ops = ops + 1.0;
  }
  instream.flush();
  long end = System.currentTimeMillis();

  double executionTime = ((double) (end - start)) / 1000.0;
  double throughput = 0.0;
  double latency = 0.0;
  double sumbits = sumbytes * 8.0;
  if (executionTime > 0) {
    throughput = sumbits / executionTime / 1024.0 / 1024.0;
    latency = 1000000.0 * executionTime / ops;
  }

  System.out.println("execution time " + executionTime);
  System.out.println("ops " + ops);
  System.out.println("sumbytes " + sumbytes);
  System.out.println("throughput " + throughput);
  System.out.println("latency " + latency);
  System.out.println("closing stream");
  instream.close();
  fs.close();
}
Example 11
Source File: JobClasspathHelper.java From kite with Apache License 2.0
/**
 * This method creates a file that contains a line with an MD5 sum.
 *
 * @param fs
 *            FileSystem where to create the file.
 * @param md5sum
 *            The string containing the MD5 sum.
 * @param remoteMd5Path
 *            The path where to save the file.
 * @throws IOException
 */
private void createMd5SumFile(FileSystem fs, String md5sum, Path remoteMd5Path) throws IOException {
  FSDataOutputStream os = null;
  try {
    os = fs.create(remoteMd5Path, true);
    os.writeBytes(md5sum);
    os.flush();
  } catch (Exception e) {
    LOG.error("{}", e);
  } finally {
    if (os != null) {
      os.close();
    }
  }
}
Example 12
Source File: TestLogalyzer.java From hadoop with Apache License 2.0
/**
 * Create simple log file
 *
 * @return
 * @throws IOException
 */
private Path createLogFile() throws IOException {
  FileContext files = FileContext.getLocalFSFileContext();

  Path ws = new Path(workSpace.getAbsoluteFile().getAbsolutePath());
  files.delete(ws, true);
  Path workSpacePath = new Path(workSpace.getAbsolutePath(), "log");
  files.mkdir(workSpacePath, null, true);

  LOG.info("create logfile.log");
  Path logfile1 = new Path(workSpacePath, "logfile.log");

  FSDataOutputStream os = files.create(logfile1, EnumSet.of(CreateFlag.CREATE));
  os.writeBytes("4 3" + EL + "1 3" + EL + "4 44" + EL);
  os.writeBytes("2 3" + EL + "1 3" + EL + "0 45" + EL);
  os.writeBytes("4 3" + EL + "1 3" + EL + "1 44" + EL);
  os.flush();
  os.close();

  LOG.info("create logfile1.log");
  Path logfile2 = new Path(workSpacePath, "logfile1.log");

  os = files.create(logfile2, EnumSet.of(CreateFlag.CREATE));
  os.writeBytes("4 3" + EL + "1 3" + EL + "3 44" + EL);
  os.writeBytes("2 3" + EL + "1 3" + EL + "0 45" + EL);
  os.writeBytes("4 3" + EL + "1 3" + EL + "1 44" + EL);
  os.flush();
  os.close();

  return workSpacePath;
}
Example 13
Source File: TestWasbFsck.java From hadoop with Apache License 2.0
/**
 * Tests that we recover files properly
 */
@Test
@Ignore
/* flush() no longer does anything. @@TODO: reinstate an appropriate test of fsck recovery */
public void testRecover() throws Exception {
  Path danglingFile = new Path("/crashedInTheMiddle");

  // Create a file and leave it dangling and try to recover it.
  FSDataOutputStream stream = fs.create(danglingFile);
  stream.write(new byte[] { 1, 2, 3 });
  stream.flush();

  // Now we should still only see a zero-byte file in this place
  FileStatus fileStatus = fs.getFileStatus(danglingFile);
  assertNotNull(fileStatus);
  assertEquals(0, fileStatus.getLen());
  assertEquals(1, getNumTempBlobs());

  // Run WasbFsck -move to recover the file.
  runFsck("-move");

  // Now we should see the file in lost+found with the data there.
  fileStatus = fs.getFileStatus(new Path("/lost+found", danglingFile.getName()));
  assertNotNull(fileStatus);
  assertEquals(3, fileStatus.getLen());
  assertEquals(0, getNumTempBlobs());

  // But not in its original location
  assertFalse(fs.exists(danglingFile));
}
Example 14
Source File: TestLineReader.java From tajo with Apache License 2.0
@Test
public void testCRLFLine() throws IOException {
  TajoConf conf = new TajoConf();
  Path testFile = new Path(CommonTestingUtil.getTestDir(TEST_PATH), "testCRLFLineText.txt");
  FileSystem fs = testFile.getFileSystem(conf);

  FSDataOutputStream outputStream = fs.create(testFile, true);
  outputStream.write("0\r\n1\r\n".getBytes());
  outputStream.flush();
  IOUtils.closeStream(outputStream);

  ByteBufInputChannel channel = new ByteBufInputChannel(fs.open(testFile));
  ByteBufLineReader reader = new ByteBufLineReader(channel, BufferPool.directBuffer(2));
  FileStatus status = fs.getFileStatus(testFile);

  long totalRead = 0;
  int i = 0;
  AtomicInteger bytes = new AtomicInteger();
  for (;;) {
    ByteBuf buf = reader.readLineBuf(bytes);
    totalRead += bytes.get();
    if (buf == null) break;

    String row = buf.toString(Charset.defaultCharset());
    assertEquals(i, Integer.parseInt(row));
    i++;
  }
  IOUtils.cleanup(null, reader);
  assertFalse(channel.isOpen());
  assertEquals(status.getLen(), totalRead);
  assertEquals(status.getLen(), reader.readBytes());
}
Example 15
Source File: CubertMD.java From Cubert with Apache License 2.0
private static void writeMetaFile(String metaFilePath,
                                  HashMap<String, String> metaFileKeyValues) throws IOException {
  Job tempjob = new Job();
  Configuration tempconf = tempjob.getConfiguration();
  FileSystem fs = FileSystem.get(tempconf);
  FSDataOutputStream outStream = fs.create(new Path(metaFilePath + "/.meta"));
  for (String key : metaFileKeyValues.keySet())
    outStream.write((key + " " + metaFileKeyValues.get(key) + "\n").getBytes());
  outStream.flush();
  outStream.close();
}
Example 16
Source File: CrawlDBTestUtil.java From anthelion with Apache License 2.0
/**
 * Generate seedlist
 * @throws IOException
 */
public static void generateSeedList(FileSystem fs, Path urlPath,
    List<String> urls, List<String> metadata) throws IOException {
  FSDataOutputStream out;
  Path file = new Path(urlPath, "urls.txt");
  fs.mkdirs(urlPath);
  out = fs.create(file);

  Iterator<String> urls_i = urls.iterator();
  Iterator<String> metadata_i = metadata.iterator();

  String url;
  String md;
  while (urls_i.hasNext()) {
    url = urls_i.next();
    out.writeBytes(url);
    if (metadata_i.hasNext()) {
      md = metadata_i.next();
      out.writeBytes(md);
    }
    out.writeBytes("\n");
  }
  out.flush();
  out.close();
}
Example 17
Source File: VectorizedHashAggPartitionSpillHandler.java From dremio-oss with Apache License 2.0
/**
 * Called by {@link VectorizedHashAggOperator} to ensure all spilled partitions are
 * entirely on disk. This is needed before the operator starts pumping out data from
 * in-memory partitions.
 *
 * The reason we do this is because once a partition has been spilled, it is very likely
 * that there would be subsequent incoming data belonging to the spilled partition. We will
 * continue to do the in-memory aggregation (contraction) or buffering (non-contraction).
 * The partition may or may not get spilled again. However, once all the input data has
 * been processed by the operator, spilled partition(s) could be holding data in-memory
 * and we need to spill/flush this data too.
 *
 * @throws Exception
 */
public void spillAnyInMemoryDataForSpilledPartitions() throws Exception {
  /* get a local reference for efficiency */
  final List<VectorizedHashAggDiskPartition> activeSpilledPartitions = this.activeSpilledPartitions;

  /* spill the memory portion of each spilled partition */
  for (VectorizedHashAggDiskPartition partitionToSpill : activeSpilledPartitions) {
    /* the in-memory portion of partition could be empty if after the partition
     * was spilled, no incoming data ever mapped to that particular partition.
     * the writeToStream() function that spills partition's data structures
     * is aware of this fact and is a NOOP if partition is empty.
     */
    final VectorizedHashAggPartition inmemoryPartition = partitionToSpill.getInmemoryPartitionBackPointer();
    final SpillFile partitionSpillFile = partitionToSpill.getSpillFile();
    final VectorizedHashAggPartitionSerializable partitionSerializable =
        new VectorizedHashAggPartitionSerializable(inmemoryPartition, this.operatorStats, this.warnMaxSpillTime);
    FSDataOutputStream outputStream = partitionToSpill.getSpillStream();

    /* write the partition to disk */
    partitionSerializable.writeToStream(outputStream);

    /* track number of spills */
    spills++;

    /* downsize the partition to minimum memory (ideally zeroed out for single batch)
     * we would still like to keep allocated */
    inmemoryPartition.resetToMinimumSize();

    final long batchesSpilled = partitionSerializable.getNumBatchesSpilled();
    final long recordsSpilled = partitionSerializable.getNumRecordsSpilled();
    final long spilledDataSize = partitionSerializable.getSpilledDataSize();
    updateLocalStats(batchesSpilled, recordsSpilled, spilledDataSize);
    partitionToSpill.addNewSpilledBatches(batchesSpilled);

    logger.debug("Flushed in-memory data for partition: {}, batches spilled: {}, spill file path: {}",
        inmemoryPartition.getIdentifier(), batchesSpilled, partitionSpillFile.getPath());

    outputStream.flush();

    /* no more to spill for this partition since operator will start a new iteration,
     * so release the cached handle */
    partitionToSpill.closeSpillStream();
  }
}
Example 18
Source File: TestHoodieLogFormat.java From hudi with Apache License 2.0
@Test
public void testAvroLogRecordReaderWithRollbackPartialBlock()
    throws IOException, URISyntaxException, InterruptedException {
  Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  // Set a small threshold so that every block is a new version
  Writer writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();

  // Write 1
  List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<IndexedRecord> copyOfRecords1 = records1.stream()
      .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  HoodieDataBlock dataBlock = getDataBlock(records1, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // Write 2
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101");
  // Append some arbitrary byte[] to the end of the log (mimics a partially written commit)
  fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
  FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
  // create a block with
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(1000);
  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
  // Write out some header
  outputStream.write(HoodieLogBlock.getLogMetadataBytes(header));
  outputStream.writeLong("something-random".getBytes().length);
  outputStream.write("something-random".getBytes());
  outputStream.flush();
  outputStream.close();

  // Rollback the last write
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102");
  header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101");
  header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
      String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
  HoodieCommandBlock commandBlock = new HoodieCommandBlock(header);
  writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  writer = writer.appendBlock(commandBlock);

  // Write 3
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103");
  List<IndexedRecord> records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<IndexedRecord> copyOfRecords3 = records3.stream()
      .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  dataBlock = getDataBlock(records3, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  List<String> allLogFiles =
      FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
          .map(s -> s.getPath().toString()).collect(Collectors.toList());

  HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "103",
      10240L, true, false, bufferSize, BASE_OUTPUT_PATH);
  assertEquals(200, scanner.getTotalLogRecords(), "We would read 200 records");
  Set<String> readKeys = new HashSet<>(200);
  scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
  assertEquals(200, readKeys.size(), "Stream collect should return all 200 records");
  copyOfRecords1.addAll(copyOfRecords3);
  Set<String> originalKeys =
      copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString())
          .collect(Collectors.toSet());
  assertEquals(originalKeys, readKeys, "CompositeAvroLogReader should return 200 records from 2 versions");
}
Example 19
Source File: TestHoodieLogFormat.java From hudi with Apache License 2.0
@Test
public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException {
  Writer writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  HoodieDataBlock dataBlock = getDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // Append some arbitrary byte[] to the end of the log (mimics a partially written commit)
  fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
  FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
  // create a block with
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(474);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  // Write out a length that does not conform to the content
  outputStream.writeLong(400);
  // Write out incomplete content
  outputStream.write("something-random".getBytes());
  outputStream.flush();
  outputStream.close();

  // Append a proper block that is of the missing length of the corrupted block
  writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  records = SchemaTestUtil.generateTestRecords(0, 10);
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  dataBlock = getDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // First round of reads - we should be able to read the first block and then EOF
  Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
  assertTrue(reader.hasNext(), "First block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should have corrupted block next");
  HoodieLogBlock block = reader.next();
  assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
  assertTrue(reader.hasNext(), "Third block should be available");
  reader.next();
  assertFalse(reader.hasNext(), "There should be no more block left");
  reader.close();

  // Simulate another failure back to back
  outputStream = fs.append(writer.getLogFile().getPath());
  // create a block with
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(1000);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  // Write out a length that does not conform to the content
  outputStream.writeLong(500);
  // Write out some bytes
  outputStream.write("something-else-random".getBytes());
  outputStream.flush();
  outputStream.close();

  // Should be able to append a new block
  writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  records = SchemaTestUtil.generateTestRecords(0, 100);
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  dataBlock = getDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // Second round of reads - we should be able to read the first and last block
  reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
  assertTrue(reader.hasNext(), "First block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should get the 1st corrupted block next");
  reader.next();
  assertTrue(reader.hasNext(), "Third block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should get the 2nd corrupted block next");
  block = reader.next();
  assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
  assertTrue(reader.hasNext(), "We should get the last block next");
  reader.next();
  assertFalse(reader.hasNext(), "We should have no more blocks left");
  reader.close();
}
Example 20
Source File: TrainingSparkRunner.java From ambiverse-nlu with Apache License 2.0
private void binaryEvaluation(DataFrame predictions, String output, TrainingSettings trainingSettings) throws IOException {
  FileSystem fs = FileSystem.get(new Configuration());
  Path evalPath = new Path(output + "binary_evaluation_" + trainingSettings.getClassificationMethod() + ".txt");
  fs.delete(evalPath, true);
  FSDataOutputStream fsdos = fs.create(evalPath);

  BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictions
      .select("rawPrediction", "label")
      .javaRDD()
      .map((Row row) -> {
        Vector vector = row.getAs("rawPrediction");
        Double label = row.getAs("label");
        return new Tuple2<Object, Object>(vector.apply(1), label);
      }).rdd());

  // Precision by threshold
  JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD();
  IOUtils.write("\nPrecision by threshold: " + precision.collect(), fsdos);

  // Recall by threshold
  JavaRDD<Tuple2<Object, Object>> recall = metrics.recallByThreshold().toJavaRDD();
  IOUtils.write("\nRecall by threshold: " + recall.collect(), fsdos);

  // F Score by threshold
  JavaRDD<Tuple2<Object, Object>> f1Score = metrics.fMeasureByThreshold().toJavaRDD();
  IOUtils.write("\nF1 Score by threshold: " + f1Score.collect(), fsdos);

  JavaRDD<Tuple2<Object, Object>> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD();
  IOUtils.write("\nF2 Score by threshold: " + f2Score.collect(), fsdos);

  // Precision-recall curve
  JavaRDD<Tuple2<Object, Object>> prc = metrics.pr().toJavaRDD();
  IOUtils.write("\nPrecision-recall curve: " + prc.collect(), fsdos);

  // Thresholds
  JavaRDD<Double> thresholds = precision.map(t -> new Double(t._1().toString()));

  // ROC Curve
  JavaRDD<Tuple2<Object, Object>> roc = metrics.roc().toJavaRDD();
  IOUtils.write("\nROC curve: " + roc.collect(), fsdos);

  // AUPRC
  IOUtils.write("\nArea under precision-recall curve = " + metrics.areaUnderPR(), fsdos);

  // AUROC
  IOUtils.write("\nArea under ROC = " + metrics.areaUnderROC(), fsdos);

  fsdos.flush();
  IOUtils.closeQuietly(fsdos);
}