Java Code Examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations()
The following examples show how to use org.apache.hadoop.fs.FileSystem#getFileBlockLocations().
Each example comes from an open-source project; the source file and license are noted above the code.
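Before the project-specific examples, here is a minimal standalone sketch of the typical call pattern: resolve a FileSystem from a Path, fetch the FileStatus, and ask for the block locations covering a byte range. The file path below is a placeholder chosen for illustration, not something taken from the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; replace with a file that exists on your cluster.
    Path file = new Path("/tmp/example.txt");
    FileSystem fs = file.getFileSystem(conf);

    // getFileBlockLocations() takes a FileStatus (or Path) plus a byte range
    // and returns one BlockLocation per block that overlaps that range.
    FileStatus status = fs.getFileStatus(file);
    BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());

    for (BlockLocation location : locations) {
      System.out.println("offset=" + location.getOffset()
          + " length=" + location.getLength()
          + " hosts=" + String.join(",", location.getHosts()));
    }
  }
}

Each BlockLocation describes a block's offset, length, and the hosts holding a replica; the examples below use this information to build locality-aware input splits or to verify replication.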
Example 1
Source File: TestNativeAzureFileSystemBlockLocations.java From big-c with Apache License 2.0
private static BlockLocation[] getBlockLocationsOutput(int fileSize, int blockSize,
    long start, long len, String blockLocationHost) throws Exception {
  Configuration conf = new Configuration();
  conf.set(NativeAzureFileSystem.AZURE_BLOCK_SIZE_PROPERTY_NAME, "" + blockSize);
  if (blockLocationHost != null) {
    conf.set(NativeAzureFileSystem.AZURE_BLOCK_LOCATION_HOST_PROPERTY_NAME,
        blockLocationHost);
  }
  AzureBlobStorageTestAccount testAccount = AzureBlobStorageTestAccount
      .createMock(conf);
  FileSystem fs = testAccount.getFileSystem();
  Path testFile = createTestFile(fs, fileSize);
  FileStatus stat = fs.getFileStatus(testFile);
  BlockLocation[] locations = fs.getFileBlockLocations(stat, start, len);
  testAccount.cleanup();
  return locations;
}
Example 2
Source File: TestCombineFileInputFormat.java From RDFS with Apache License 2.0
@Override
protected LocatedFileStatus[] listLocatedStatus(JobConf job) throws IOException {
  Path[] files = getInputPaths(job);
  LocatedFileStatus[] results = new LocatedFileStatus[files.length];
  for (int i = 0; i < files.length; i++) {
    Path p = files[i];
    FileSystem fs = p.getFileSystem(job);
    FileStatus stat = fs.getFileStatus(p);
    if (stat.isDir()) {
      results[i] = new LocatedFileStatus(stat, null);
    } else {
      results[i] = new LocatedFileStatus(stat,
          fs.getFileBlockLocations(stat, 0, stat.getLen()));
    }
  }
  return results;
}
Example 3
Source File: Util.java From iceberg with Apache License 2.0
public static String[] blockLocations(CombinedScanTask task, Configuration conf) {
  Set<String> locationSets = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    Path path = new Path(f.file().path().toString());
    try {
      FileSystem fs = path.getFileSystem(conf);
      for (BlockLocation b : fs.getFileBlockLocations(path, f.start(), f.length())) {
        locationSets.addAll(Arrays.asList(b.getHosts()));
      }
    } catch (IOException ioe) {
      LOG.warn("Failed to get block locations for path {}", path, ioe);
    }
  }
  return locationSets.toArray(new String[0]);
}
Example 4
Source File: BlurBlockPlacementPolicyDefaultTest.java From incubator-retired-blur with Apache License 2.0
private void waitForReplication(FileSystem fileSystem, Path p, int replicas)
    throws IOException, InterruptedException {
  FileStatus fileStatus = fileSystem.getFileStatus(p);
  boolean fail = true;
  while (fail) {
    fail = false;
    BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(p, 0,
        fileStatus.getLen());
    for (BlockLocation blockLocation : blockLocations) {
      System.out.println(blockLocation);
      String[] hosts = blockLocation.getHosts();
      if (hosts.length != replicas) {
        fail = true;
      }
    }
    Thread.sleep(1000);
  }
}
Example 5
Source File: TestSmallBlock.java From hadoop-gpu with Apache License 2.0
private void checkFile(FileSystem fileSys, Path name) throws IOException {
  BlockLocation[] locations = fileSys.getFileBlockLocations(
      fileSys.getFileStatus(name), 0, fileSize);
  assertEquals("Number of blocks", fileSize, locations.length);
  FSDataInputStream stm = fileSys.open(name);
  byte[] expected = new byte[fileSize];
  if (simulatedStorage) {
    for (int i = 0; i < expected.length; ++i) {
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    Random rand = new Random(seed);
    rand.nextBytes(expected);
  }
  // do a sanity check. Read the file
  byte[] actual = new byte[fileSize];
  stm.readFully(0, actual);
  checkAndEraseData(actual, 0, expected, "Read Sanity Test");
  stm.close();
}
Example 6
Source File: MergeSortRowIdMatcher.java From incubator-retired-blur with Apache License 2.0
private static String getFirstBlockId(FileSystem fileSystem, Path realFile)
    throws IOException {
  FileStatus fileStatus = fileSystem.getFileStatus(realFile);
  BlockLocation[] locations = fileSystem.getFileBlockLocations(fileStatus, 0, 1);
  HdfsBlockLocation location = (HdfsBlockLocation) locations[0];
  LocatedBlock locatedBlock = location.getLocatedBlock();
  ExtendedBlock block = locatedBlock.getBlock();
  return toNiceString(block.getBlockId());
}
Example 7
Source File: BlurBlockPlacementPolicyDefaultTest.java From incubator-retired-blur with Apache License 2.0
private void assertBlocksExistOnShardServer(FileSystem fileSystem, Path p,
    String shardServer) throws IOException {
  FileStatus fileStatus = fileSystem.getFileStatus(p);
  BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(p, 0,
      fileStatus.getLen());
  for (BlockLocation blockLocation : blockLocations) {
    System.out.println(blockLocation);
    String[] hosts = blockLocation.getHosts();
    assertTrue(Arrays.asList(hosts).contains(shardServer));
  }
}
Example 8
Source File: MultiFileSplit.java From hadoop with Apache License 2.0
public String[] getLocations() throws IOException {
  HashSet<String> hostSet = new HashSet<String>();
  for (Path file : getPaths()) {
    FileSystem fs = file.getFileSystem(getJob());
    FileStatus status = fs.getFileStatus(file);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(status, 0,
        status.getLen());
    if (blkLocations != null && blkLocations.length > 0) {
      addToSet(hostSet, blkLocations[0].getHosts());
    }
  }
  return hostSet.toArray(new String[hostSet.size()]);
}
Example 9
Source File: TestFileLocalRead.java From RDFS with Apache License 2.0
private void checkFile(FileSystem fileSys, Path name, int repl) throws IOException {
  boolean done = false;

  // wait till all full blocks are confirmed by the datanodes.
  while (!done) {
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {}
    done = true;
    BlockLocation[] locations = fileSys.getFileBlockLocations(
        fileSys.getFileStatus(name), 0, fileSize);
    if (locations.length < numBlocks) {
      done = false;
      continue;
    }
    for (int idx = 0; idx < locations.length; idx++) {
      if (locations[idx].getHosts().length < repl) {
        done = false;
        break;
      }
    }
  }
  FSDataInputStream stm = fileSys.open(name);
  final byte[] expected;
  if (simulatedStorage) {
    expected = new byte[numBlocks * blockSize];
    for (int i = 0; i < expected.length; i++) {
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    expected = AppendTestUtil.randomBytes(seed, numBlocks * blockSize);
  }
  // do a sanity check. Read the file
  byte[] actual = new byte[numBlocks * blockSize];
  System.out.println("Verifying file ");
  stm.readFully(0, actual);
  stm.close();
  checkData(actual, 0, expected, "Read 1");
}
Example 10
Source File: FileInputFormat.java From hadoop-gpu with Apache License 2.0
/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  for (FileStatus file : listStatus(job)) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);

      long bytesRemaining = length;
      while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
            blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitSize;
      }

      if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
            blkLocations[blkLocations.length - 1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else {
      // Create empty hosts array for zero length files
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }
  LOG.debug("Total # of splits: " + splits.size());
  return splits;
}
Example 11
Source File: TestSafeMode.java From big-c with Apache License 2.0
void checkGetBlockLocationsWorks(FileSystem fs, Path fileName) throws IOException {
  FileStatus stat = fs.getFileStatus(fileName);
  try {
    fs.getFileBlockLocations(stat, 0, 1000);
  } catch (SafeModeException e) {
    assertTrue("Should have not got safemode exception", false);
  } catch (RemoteException re) {
    assertTrue("Should have not got safemode exception", false);
  }
}
Example 12
Source File: TestSafeMode.java From hadoop with Apache License 2.0
void checkGetBlockLocationsWorks(FileSystem fs, Path fileName) throws IOException {
  FileStatus stat = fs.getFileStatus(fileName);
  try {
    fs.getFileBlockLocations(stat, 0, 1000);
  } catch (SafeModeException e) {
    assertTrue("Should have not got safemode exception", false);
  } catch (RemoteException re) {
    assertTrue("Should have not got safemode exception", false);
  }
}
Example 13
Source File: AdmmIterationInputFormat.java From laser with Apache License 2.0
public List<InputSplit> getSplits(JobContext job) throws IOException {
  Configuration conf = job.getConfiguration();
  int numMapTasks = conf.getInt("admm.iteration.num.map.tasks", 0);
  if (0 == numMapTasks) {
    return super.getSplits(job);
  }

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<FileStatus> files = listStatus(job);
  for (FileStatus file : files) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      long splitSize = Math.max(
          computeSplitSize(JAVA_OPTS, numMapTasks, length), blockSize);
      long splitLength = (long) (length / Math.ceil((double) length / splitSize));
      long bytesRemaining = length;
      while (((double) bytesRemaining) / splitLength > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(new FileSplit(path, length - bytesRemaining, splitLength,
            blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitLength;
      }

      if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
            blkLocations[blkLocations.length - 1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else {
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }

  // Save the number of input files in the job-conf
  job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
  job.getConfiguration().setInt("admm.iteration.num.map.tasks", splits.size());
  return splits;
}
Example 14
Source File: TestBlocksWithNotEnoughRacks.java From big-c with Apache License 2.0
@Test
public void testNodeDecomissionRespectsRackPolicy() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
  } finally {
    cluster.shutdown();
  }
}
Example 15
Source File: TestHostsFiles.java From hadoop with Apache License 2.0
@Test
public void testHostsExcludeInUI() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    String names = name + "\n" + "localhost:42\n";
    LOG.info("adding '" + names + "' to exclude file " + excludeFile.toUri().getPath());
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    String nodes = (String) mbs.getAttribute(mxbeanName, "LiveNodes");
    assertTrue("Live nodes should contain the decommissioned node",
        nodes.contains("Decommissioned"));
  } finally {
    cluster.shutdown();
  }
}
Example 16
Source File: TestDistributedFileSystem.java From hadoop with Apache License 2.0
@Test
public void testStatistics() throws Exception {
  int lsLimit = 2;
  final Configuration conf = getTestConfiguration();
  conf.setInt(DFSConfigKeys.DFS_LIST_LIMIT, lsLimit);
  final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
  try {
    final FileSystem fs = cluster.getFileSystem();
    Path dir = new Path("/test");
    Path file = new Path(dir, "file");

    int readOps = DFSTestUtil.getStatistics(fs).getReadOps();
    int writeOps = DFSTestUtil.getStatistics(fs).getWriteOps();
    int largeReadOps = DFSTestUtil.getStatistics(fs).getLargeReadOps();
    fs.mkdirs(dir);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    FSDataOutputStream out = fs.create(file, (short)1);
    out.close();
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    FileStatus status = fs.getFileStatus(file);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    fs.getFileBlockLocations(file, 0, 0);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    fs.getFileBlockLocations(status, 0, 0);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    FSDataInputStream in = fs.open(file);
    in.close();
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    fs.setReplication(file, (short)2);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    Path file1 = new Path(dir, "file1");
    fs.rename(file, file1);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    fs.getContentSummary(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    // Iterative ls test
    for (int i = 0; i < 10; i++) {
      Path p = new Path(dir, Integer.toString(i));
      fs.mkdirs(p);
      FileStatus[] list = fs.listStatus(dir);
      if (list.length > lsLimit) {
        // if large directory, then count readOps and largeReadOps by
        // number times listStatus iterates
        int iterations = (int) Math.ceil((double) list.length / lsLimit);
        largeReadOps += iterations;
        readOps += iterations;
      } else {
        // Single iteration in listStatus - no large read operation done
        readOps++;
      }

      // writeOps incremented by 1 for mkdirs
      // readOps and largeReadOps incremented by 1 or more
      checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    }

    fs.getStatus(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    fs.getFileChecksum(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);

    fs.setPermission(file1, new FsPermission((short)0777));
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    fs.setTimes(file1, 0L, 0L);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    fs.setOwner(file1, ugi.getUserName(), ugi.getGroupNames()[0]);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);

    fs.delete(dir, true);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
  } finally {
    if (cluster != null) cluster.shutdown();
  }
}
Example 17
Source File: TestDatanodeDeath.java From hadoop with Apache License 2.0
static private void checkFile(FileSystem fileSys, Path name, int repl,
    int numblocks, int filesize, long seed) throws IOException {
  boolean done = false;
  int attempt = 0;

  long len = fileSys.getFileStatus(name).getLen();
  assertTrue(name + " should be of size " + filesize +
      " but found to be of size " + len,
      len == filesize);

  // wait till all full blocks are confirmed by the datanodes.
  while (!done) {
    attempt++;
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {}
    done = true;
    BlockLocation[] locations = fileSys.getFileBlockLocations(
        fileSys.getFileStatus(name), 0, filesize);
    if (locations.length < numblocks) {
      if (attempt > 100) {
        System.out.println("File " + name + " has only " +
            locations.length + " blocks, " +
            " but is expected to have " + numblocks + " blocks.");
      }
      done = false;
      continue;
    }
    for (int idx = 0; idx < locations.length; idx++) {
      if (locations[idx].getHosts().length < repl) {
        if (attempt > 100) {
          System.out.println("File " + name + " has " +
              locations.length + " blocks: " +
              " The " + idx + " block has only " +
              locations[idx].getHosts().length +
              " replicas but is expected to have " + repl + " replicas.");
        }
        done = false;
        break;
      }
    }
  }
  FSDataInputStream stm = fileSys.open(name);
  final byte[] expected = AppendTestUtil.randomBytes(seed, fileSize);

  // do a sanity check. Read the file
  byte[] actual = new byte[filesize];
  stm.readFully(0, actual);
  checkData(actual, 0, expected, "Read 1");
}
Example 18
Source File: WikipediaEventInputFormat.java From datawave with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  final Configuration conf = job.getConfiguration();
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  // generate splits
  List<InputSplit> splits = new ArrayList<>();
  List<FileStatus> files = listStatus(job);
  for (FileStatus file : files) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(conf);
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);

      long bytesRemaining = length;
      while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
            blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitSize;
      }

      if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
            blkLocations[blkLocations.length - 1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else {
      // Create empty hosts array for zero length files
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }

  // Save the number of input files in the job-conf
  conf.setLong(NUM_INPUT_FILES, files.size());
  return splits;
}
Example 19
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
    long maxSplitSize, long minSplitSize, ReadContext readContext) throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);

  long rowGroupsDropped = 0;
  long totalRowGroups = 0;

  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fs = file.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(file);
    ParquetMetadata parquetMetaData = footer.getParquetMetadata();
    List<BlockMetaData> blocks = parquetMetaData.getBlocks();

    List<BlockMetaData> filteredBlocks;

    totalRowGroups += blocks.size();
    filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks,
        parquetMetaData.getFileMetaData().getSchema());
    rowGroupsDropped += blocks.size() - filteredBlocks.size();

    if (filteredBlocks.isEmpty()) {
      continue;
    }

    BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0,
        fileStatus.getLen());
    splits.addAll(
        generateSplits(
            filteredBlocks,
            fileBlockLocations,
            fileStatus,
            readContext.getRequestedSchema().toString(),
            readContext.getReadSupportMetadata(),
            minSplitSize,
            maxSplitSize)
        );
  }

  if (rowGroupsDropped > 0 && totalRowGroups > 0) {
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)",
        rowGroupsDropped, percentDropped);
  } else {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  }
  return splits;
}
Example 20
Source File: TestHostsFiles.java From big-c with Apache License 2.0
@Test
public void testHostsExcludeInUI() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    String names = name + "\n" + "localhost:42\n";
    LOG.info("adding '" + names + "' to exclude file " + excludeFile.toUri().getPath());
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    String nodes = (String) mbs.getAttribute(mxbeanName, "LiveNodes");
    assertTrue("Live nodes should contain the decommissioned node",
        nodes.contains("Decommissioned"));
  } finally {
    cluster.shutdown();
  }
}