Java Code Examples for org.apache.hadoop.hdfs.MiniDFSCluster#restartDataNode()
The following examples show how to use
org.apache.hadoop.hdfs.MiniDFSCluster#restartDataNode() .
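All of the examples below share the same basic pattern: stop a DataNode with stopDataNode(), keep the returned MiniDFSCluster.DataNodeProperties handle, and hand it back to restartDataNode() (optionally with a second boolean argument to keep the same transfer port). The following is a minimal sketch of that pattern, not taken from any of the projects listed; the cluster size and configuration are illustrative assumptions, and MiniDFSCluster requires the hadoop-hdfs test artifacts on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;

public class RestartDataNodeSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new HdfsConfiguration();
    // Node count is an arbitrary choice for illustration.
    MiniDFSCluster cluster =
        new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
    try {
      cluster.waitActive();
      // Stop DataNode 0 and keep its properties so it can be brought back.
      DataNodeProperties dnProps = cluster.stopDataNode(0);
      // ... exercise the cluster while the node is down ...
      // Restart the same DataNode; several examples below instead use the
      // (dnProps, true) overload to reuse the same transfer port.
      cluster.restartDataNode(dnProps);
      cluster.waitActive();
    } finally {
      cluster.shutdown();
    }
  }
}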
Example 1
Source File: TestPendingCorruptDnMessages.java From hadoop with Apache License 2.0
private static boolean wipeAndRestartDn(MiniDFSCluster cluster, int dnIndex)
    throws IOException {
  // stop the DN, reformat it, then start it again with the same xfer port.
  DataNodeProperties dnProps = cluster.stopDataNode(dnIndex);
  cluster.formatDataNodeDirs();
  return cluster.restartDataNode(dnProps, true);
}
Example 2
Source File: TestProcessCorruptBlocks.java From hadoop with Apache License 2.0
/**
 * The corrupt block has to be removed when the number of valid replicas
 * matches the replication factor for the file. In this test, the above
 * condition is achieved by increasing the number of good replicas by
 * replicating on a new datanode.
 * The test strategy:
 *   Bring up a cluster with 3 DataNodes
 *   Create a file with replication factor 3
 *   Corrupt one replica of a block of the file
 *   Verify that there are still 2 good replicas and 1 corrupt replica
 *     (the corrupt replica should not be removed since the number of good
 *     replicas (2) is less than the replication factor (3))
 *   Start a new datanode
 *   Verify that a new replica is created and the corrupt replica is removed.
 */
@Test
public void testByAddingAnExtraDataNode() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
      Integer.toString(2));
  MiniDFSCluster cluster =
      new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  FileSystem fs = cluster.getFileSystem();
  final FSNamesystem namesystem = cluster.getNamesystem();
  DataNodeProperties dnPropsFourth = cluster.stopDataNode(3);

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short) 3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    corruptBlock(cluster, fs, fileName, 0, block);

    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    assertEquals(2, countReplicas(namesystem, block).liveReplicas());
    assertEquals(1, countReplicas(namesystem, block).corruptReplicas());

    cluster.restartDataNode(dnPropsFourth);

    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    assertEquals(3, countReplicas(namesystem, block).liveReplicas());
    assertEquals(0, countReplicas(namesystem, block).corruptReplicas());
  } finally {
    cluster.shutdown();
  }
}
Example 3
Source File: TestProcessCorruptBlocks.java From hadoop with Apache License 2.0
private void corruptBlock(MiniDFSCluster cluster, FileSystem fs,
    final Path fileName, int dnIndex, ExtendedBlock block) throws IOException {
  // corrupt the block on datanode dnIndex
  // the indexes change once the nodes are restarted,
  // but the data directory will not change
  assertTrue(cluster.corruptReplica(dnIndex, block));

  DataNodeProperties dnProps = cluster.stopDataNode(0);

  // Each datanode has multiple data dirs, check each
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getStorageDir(dnIndex, dirIndex);
    File dataDir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
    File scanLogFile = new File(dataDir, "dncp_block_verification.log.curr");
    if (scanLogFile.exists()) {
      // wait up to one minute for deletion to succeed
      for (int i = 0; !scanLogFile.delete(); i++) {
        assertTrue("Could not delete log file in one minute", i < 60);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  // restart the datanode so the corrupt replica will be detected
  cluster.restartDataNode(dnProps);
}
Example 4
Source File: TestProcessCorruptBlocks.java From big-c with Apache License 2.0
private void corruptBlock(MiniDFSCluster cluster, FileSystem fs,
    final Path fileName, int dnIndex, ExtendedBlock block) throws IOException {
  // corrupt the block on datanode dnIndex
  // the indexes change once the nodes are restarted,
  // but the data directory will not change
  assertTrue(cluster.corruptReplica(dnIndex, block));

  DataNodeProperties dnProps = cluster.stopDataNode(0);

  // Each datanode has multiple data dirs, check each
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getStorageDir(dnIndex, dirIndex);
    File dataDir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
    File scanLogFile = new File(dataDir, "dncp_block_verification.log.curr");
    if (scanLogFile.exists()) {
      // wait up to one minute for deletion to succeed
      for (int i = 0; !scanLogFile.delete(); i++) {
        assertTrue("Could not delete log file in one minute", i < 60);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  // restart the datanode so the corrupt replica will be detected
  cluster.restartDataNode(dnProps);
}
Example 5
Source File: TestPendingCorruptDnMessages.java From big-c with Apache License 2.0
private static boolean wipeAndRestartDn(MiniDFSCluster cluster, int dnIndex)
    throws IOException {
  // stop the DN, reformat it, then start it again with the same xfer port.
  DataNodeProperties dnProps = cluster.stopDataNode(dnIndex);
  cluster.formatDataNodeDirs();
  return cluster.restartDataNode(dnProps, true);
}
Example 6
Source File: TestProcessCorruptBlocks.java From big-c with Apache License 2.0
/**
 * The corrupt block has to be removed when the number of valid replicas
 * matches the replication factor for the file. In this test, the above
 * condition is achieved by increasing the number of good replicas by
 * replicating on a new datanode.
 * The test strategy:
 *   Bring up a cluster with 3 DataNodes
 *   Create a file with replication factor 3
 *   Corrupt one replica of a block of the file
 *   Verify that there are still 2 good replicas and 1 corrupt replica
 *     (the corrupt replica should not be removed since the number of good
 *     replicas (2) is less than the replication factor (3))
 *   Start a new datanode
 *   Verify that a new replica is created and the corrupt replica is removed.
 */
@Test
public void testByAddingAnExtraDataNode() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
      Integer.toString(2));
  MiniDFSCluster cluster =
      new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  FileSystem fs = cluster.getFileSystem();
  final FSNamesystem namesystem = cluster.getNamesystem();
  DataNodeProperties dnPropsFourth = cluster.stopDataNode(3);

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short) 3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    corruptBlock(cluster, fs, fileName, 0, block);

    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    assertEquals(2, countReplicas(namesystem, block).liveReplicas());
    assertEquals(1, countReplicas(namesystem, block).corruptReplicas());

    cluster.restartDataNode(dnPropsFourth);

    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    assertEquals(3, countReplicas(namesystem, block).liveReplicas());
    assertEquals(0, countReplicas(namesystem, block).corruptReplicas());
  } finally {
    cluster.shutdown();
  }
}
Example 7
Source File: TestStandbyIsHot.java From big-c with Apache License 2.0
/**
 * Regression test for HDFS-2795:
 *  - Start an HA cluster with a DN.
 *  - Write several blocks to the FS with replication 1.
 *  - Shutdown the DN.
 *  - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
 *  - Restart the DN.
 * In the bug, the standby node would only very slowly notice the blocks returning
 * to the cluster.
 */
@Test(timeout=60000)
public void testDatanodeRestarts() throws Exception {
  Configuration conf = new Configuration();
  conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
  // We read from the standby to watch block locations
  HAUtil.setAllowStandbyReads(conf, true);
  conf.setLong(DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 0);
  conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .nnTopology(MiniDFSNNTopology.simpleHATopology())
    .numDataNodes(1)
    .build();
  try {
    NameNode nn0 = cluster.getNameNode(0);
    NameNode nn1 = cluster.getNameNode(1);

    cluster.transitionToActive(0);

    // Create 5 blocks.
    DFSTestUtil.createFile(cluster.getFileSystem(0),
        TEST_FILE_PATH, 5*1024, (short)1, 1L);

    HATestUtil.waitForStandbyToCatchUp(nn0, nn1);

    // Stop the DN.
    DataNode dn = cluster.getDataNodes().get(0);
    String dnName = dn.getDatanodeId().getXferAddr();
    DataNodeProperties dnProps = cluster.stopDataNode(0);

    // Make sure both NNs register it as dead.
    BlockManagerTestUtil.noticeDeadDatanode(nn0, dnName);
    BlockManagerTestUtil.noticeDeadDatanode(nn1, dnName);

    BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    assertEquals(5, nn0.getNamesystem().getUnderReplicatedBlocks());

    // The SBN will not have any blocks in its neededReplication queue
    // since the SBN doesn't process replication.
    assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());

    LocatedBlocks locs = nn1.getRpcServer().getBlockLocations(
        TEST_FILE, 0, 1);
    assertEquals("Standby should have registered that the block has no replicas",
        0, locs.get(0).getLocations().length);

    cluster.restartDataNode(dnProps);
    // Wait for both NNs to re-register the DN.
    cluster.waitActive(0);
    cluster.waitActive(1);

    BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    assertEquals(0, nn0.getNamesystem().getUnderReplicatedBlocks());
    assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());

    locs = nn1.getRpcServer().getBlockLocations(
        TEST_FILE, 0, 1);
    assertEquals("Standby should have registered that the block has replicas again",
        1, locs.get(0).getLocations().length);
  } finally {
    cluster.shutdown();
  }
}
Example 8
Source File: TestNodeCount.java From RDFS with Apache License 2.0
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new Configuration();
  conf.setInt("dfs.replication.interval", 10);
  final short REPLICATION_FACTOR = (short)2;
  final MiniDFSCluster cluster =
    new MiniDFSCluster(conf, REPLICATION_FACTOR, true, null);
  try {
    final FSNamesystem namesystem = cluster.getNameNode().namesystem;
    final FileSystem fs = cluster.getFileSystem();

    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    Block block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptors
    DatanodeDescriptor[] datanodes = (DatanodeDescriptor[])
      namesystem.heartbeats.toArray(new DatanodeDescriptor[REPLICATION_FACTOR]);

    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive(false);

    LOG.info("Bringing down first DN");
    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getName());
    // make sure that NN detects that the datanode is down
    synchronized (namesystem.heartbeats) {
      datanode.setLastUpdate(0); // mark it dead
      namesystem.heartbeatCheck();
    }

    LOG.info("Waiting for block to be replicated");
    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    LOG.info("Restarting first datanode");
    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive(false);

    LOG.info("Waiting for excess replicas to be detected");
    // check if the excess replica is detected
    waitForExcessReplicasToChange(namesystem, block, 1);

    LOG.info("Finding a non-excess node");
    // find a non-excess node
    Iterator<DatanodeDescriptor> iter = namesystem.blocksMap.nodeIterator(block);
    DatanodeDescriptor nonExcessDN = null;
    while (iter.hasNext()) {
      DatanodeDescriptor dn = iter.next();
      Collection<Block> blocks = namesystem.excessReplicateMap.get(dn.getStorageID());
      if (blocks == null || !blocks.contains(block)) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN != null);

    LOG.info("Stopping non-excess node: " + nonExcessDN);
    // bring down the non-excess datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getName());
    // make sure that NN detects that the datanode is down
    synchronized (namesystem.heartbeats) {
      nonExcessDN.setLastUpdate(0); // mark it dead
      namesystem.heartbeatCheck();
    }

    LOG.info("Waiting for live replicas to hit repl factor");
    // The block should be replicated
    NumberReplicas num;
    do {
      namesystem.readLock();
      try {
        num = namesystem.countNodes(block);
      } finally {
        namesystem.readUnlock();
      }
    } while (num.liveReplicas() != REPLICATION_FACTOR);

    LOG.info("Restarting first DN");
    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive(false);

    // check if the excess replica is detected
    LOG.info("Waiting for excess replicas to be detected");
    waitForExcessReplicasToChange(namesystem, block, 2);
  } finally {
    cluster.shutdown();
  }
}
Example 9
Source File: TestUnderReplicatedBlocks.java From RDFS with Apache License 2.0
public void testUnderReplicationWithDecommissionDataNode() throws Exception {
  final Configuration conf = new Configuration();
  final short REPLICATION_FACTOR = (short)1;
  File f = new File(HOST_FILE_PATH);
  if (f.exists()) {
    f.delete();
  }
  conf.set("dfs.hosts.exclude", HOST_FILE_PATH);
  LOG.info("Start the cluster");
  final MiniDFSCluster cluster =
    new MiniDFSCluster(conf, REPLICATION_FACTOR, true, null);
  try {
    final FSNamesystem namesystem = cluster.getNameNode().namesystem;
    final FileSystem fs = cluster.getFileSystem();
    DatanodeDescriptor[] datanodes = (DatanodeDescriptor[])
      namesystem.heartbeats.toArray(
          new DatanodeDescriptor[REPLICATION_FACTOR]);
    assertEquals(1, datanodes.length);

    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile2");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    Block block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // shutdown the datanode
    DataNodeProperties dnprop = shutdownDataNode(cluster, datanodes[0]);
    assertEquals(1, namesystem.getMissingBlocksCount()); // one missing block
    assertEquals(0, namesystem.getNonCorruptUnderReplicatedBlocks());

    // Make the only datanode decommissioned
    LOG.info("Decommission the datanode " + dnprop);
    addToExcludeFile(namesystem.getConf(), datanodes);
    namesystem.refreshNodes(namesystem.getConf());

    // bring up the datanode
    cluster.restartDataNode(dnprop);

    // Wait for block report
    LOG.info("wait for its block report to come in");
    NumberReplicas num;
    long startTime = System.currentTimeMillis();
    do {
      namesystem.readLock();
      try {
        num = namesystem.countNodes(block);
      } finally {
        namesystem.readUnlock();
      }
      Thread.sleep(1000);
      LOG.info("live: " + num.liveReplicas()
          + " decommissioned: " + num.decommissionedReplicas());
    } while (num.decommissionedReplicas() != 1 &&
        System.currentTimeMillis() - startTime < 30000);

    assertEquals("Decommissioning Replicas doesn't reach 1",
        1, num.decommissionedReplicas());
    assertEquals(1, namesystem.getNonCorruptUnderReplicatedBlocks());
    assertEquals(0, namesystem.getMissingBlocksCount());
  } finally {
    cluster.shutdown();
  }
}
Example 10
Source File: TestDataNodeMultipleRegistrations.java From big-c with Apache License 2.0
@Test
public void testDNWithInvalidStorageWithHA() throws Exception {
  MiniDFSNNTopology top = new MiniDFSNNTopology()
    .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
      .addNN(new MiniDFSNNTopology.NNConf("nn0").setClusterId("cluster-1"))
      .addNN(new MiniDFSNNTopology.NNConf("nn1").setClusterId("cluster-1")));

  top.setFederation(true);

  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(top)
      .numDataNodes(0).build();
  try {
    cluster.startDataNodes(conf, 1, true, null, null);
    // let the initialization complete
    Thread.sleep(10000);

    DataNode dn = cluster.getDataNodes().get(0);
    assertTrue("Datanode should be running", dn.isDatanodeUp());
    assertEquals("BPOfferService should be running", 1,
        dn.getAllBpOs().length);
    DataNodeProperties dnProp = cluster.stopDataNode(0);

    cluster.getNameNode(0).stop();
    cluster.getNameNode(1).stop();
    Configuration nn1 = cluster.getConfiguration(0);
    Configuration nn2 = cluster.getConfiguration(1);

    // setting up an invalid cluster
    StartupOption.FORMAT.setClusterId("cluster-2");
    DFSTestUtil.formatNameNode(nn1);
    MiniDFSCluster.copyNameDirs(FSNamesystem.getNamespaceDirs(nn1),
        FSNamesystem.getNamespaceDirs(nn2), nn2);
    cluster.restartNameNode(0, false);
    cluster.restartNameNode(1, false);
    cluster.restartDataNode(dnProp);

    // let the initialization complete
    Thread.sleep(10000);

    dn = cluster.getDataNodes().get(0);
    assertFalse("Datanode should have shutdown as only service failed",
        dn.isDatanodeUp());
  } finally {
    cluster.shutdown();
  }
}
Example 11
Source File: TestOverReplicatedBlocks.java From big-c with Apache License 2.0
/**
 * Test that processOverReplicatedBlock can handle corrupt replicas fine.
 * It makes sure that it won't treat corrupt replicas as valid ones,
 * thus preventing the NN from deleting valid replicas but keeping
 * corrupt ones.
 */
@Test
public void testProcesOverReplicateBlock() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, 100L);
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
      Integer.toString(2));
  MiniDFSCluster cluster =
      new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  FileSystem fs = cluster.getFileSystem();

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);

    // corrupt the block on datanode 0
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    assertTrue(cluster.corruptReplica(0, block));
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove the block scanner log to trigger block scanning
    File scanCursor = new File(new File(MiniDFSCluster.getFinalizedDir(
        cluster.getInstanceStorageDir(0, 0),
        cluster.getNamesystem().getBlockPoolId()).getParent()).getParent(),
        "scanner.cursor");
    // wait for one minute for deletion to succeed
    for (int i = 0; !scanCursor.delete(); i++) {
      assertTrue("Could not delete " + scanCursor.getAbsolutePath() +
          " in one minute", i < 60);
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignored) {}
    }

    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);

    String blockPoolId = cluster.getNamesystem().getBlockPoolId();
    final DatanodeID corruptDataNode =
        DataNodeTestUtils.getDNRegistrationForBP(
            cluster.getDataNodes().get(2), blockPoolId);

    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    try {
      namesystem.writeLock();
      synchronized(hm) {
        // set the live datanodes' remaining space to be 0
        // so they will be chosen to be deleted when over-replication occurs
        String corruptMachineName = corruptDataNode.getXferAddr();
        for (DatanodeDescriptor datanode : hm.getDatanodes()) {
          if (!corruptMachineName.equals(datanode.getXferAddr())) {
            datanode.getStorageInfos()[0].setUtilizationForTesting(100L, 100L, 0, 100L);
            datanode.updateHeartbeat(
                BlockManagerTestUtil.getStorageReportsForDatanode(datanode),
                0L, 0L, 0, 0, null);
          }
        }

        // decrease the replication factor to 1;
        NameNodeAdapter.setReplication(namesystem, fileName.toString(), (short)1);

        // the corrupt one won't be chosen to be the excess one
        // without 4910 the number of live replicas would be 0: the block gets lost
        assertEquals(1, bm.countNodes(block.getLocalBlock()).liveReplicas());
      }
    } finally {
      namesystem.writeUnlock();
    }
  } finally {
    cluster.shutdown();
  }
}
Example 12
Source File: TestBlocksWithNotEnoughRacks.java From big-c with Apache License 2.0
@Test
public void testCorruptBlockRereplicatedAcrossRacks() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  int fileLen = 512;
  final Path filePath = new Path("/testFile");
  // Datanodes are spread across two racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block with a replication factor of 2
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, fileLen, REPLICATION_FACTOR, 1L);
    final String fileContent = DFSTestUtil.readFile(fs, filePath);

    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Corrupt a replica of the block
    int dnToCorrupt = DFSTestUtil.firstDnWithBlock(cluster, b);
    assertTrue(cluster.corruptReplica(dnToCorrupt, b));

    // Restart the datanode so blocks are re-scanned, and the corrupt
    // block is detected.
    cluster.restartDataNode(dnToCorrupt);

    // Wait for the namenode to notice the corrupt replica
    DFSTestUtil.waitCorruptReplicas(fs, ns, filePath, b, 1);

    // The rack policy is still respected
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Ensure all replicas are valid (the corrupt replica may not
    // have been cleaned up yet).
    for (int i = 0; i < racks.length; i++) {
      String blockContent = cluster.readBlockOnDataNode(i, b);
      if (blockContent != null && i != dnToCorrupt) {
        assertEquals("Corrupt replica", fileContent, blockContent);
      }
    }
  } finally {
    cluster.shutdown();
  }
}
Example 13
Source File: TestNodeCount.java From big-c with Apache License 2.0
@Test
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new HdfsConfiguration();
  final MiniDFSCluster cluster =
    new MiniDFSCluster.Builder(conf).numDataNodes(REPLICATION_FACTOR).build();
  try {
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    final FileSystem fs = cluster.getFileSystem();

    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptors
    final DatanodeDescriptor[] datanodes = hm.getDatanodes();

    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();

    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getXferAddr());

    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), datanode.getXferAddr());

    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if the excess replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() == 0) {
      checkTimeout("excess replicas not detected");
    }

    // find a non-excess node
    DatanodeDescriptor nonExcessDN = null;
    for (DatanodeStorageInfo storage : bm.blocksMap.getStorages(block.getLocalBlock())) {
      final DatanodeDescriptor dn = storage.getDatanodeDescriptor();
      Collection<Block> blocks = bm.excessReplicateMap.get(dn.getDatanodeUuid());
      if (blocks == null || !blocks.contains(block.getLocalBlock())) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN != null);

    // bring down the non-excess datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getXferAddr());
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), nonExcessDN.getXferAddr());

    // The block should be replicated
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).liveReplicas() != REPLICATION_FACTOR) {
      checkTimeout("live replica count not correct", 1000);
    }

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if the excess replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() != 2) {
      checkTimeout("excess replica count not equal to 2");
    }
  } finally {
    cluster.shutdown();
  }
}
Example 14
Source File: TestNetworkTopology.java From hadoop with Apache License 2.0
@Test(timeout=180000)
public void testInvalidNetworkTopologiesNotCachedInHdfs() throws Exception {
  // start a cluster
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = null;
  try {
    // bad rack topology
    String racks[] = { "/a/b", "/c" };
    String hosts[] = { "foo1.example.com", "foo2.example.com" };
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).
      racks(racks).hosts(hosts).build();
    cluster.waitActive();

    NamenodeProtocols nn = cluster.getNameNodeRpc();
    Assert.assertNotNull(nn);

    // Wait for one DataNode to register.
    // The other DataNode will not be able to register because of the rack mismatch.
    DatanodeInfo[] info;
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      Assert.assertFalse(info.length == 2);
      if (info.length == 1) {
        break;
      }
      Thread.sleep(1000);
    }

    // Set the network topology of the other node to match the network
    // topology of the node that came up.
    int validIdx = info[0].getHostName().equals(hosts[0]) ? 0 : 1;
    int invalidIdx = validIdx == 1 ? 0 : 1;
    StaticMapping.addNodeToRack(hosts[invalidIdx], racks[validIdx]);
    LOG.info("datanode " + validIdx + " came up with network location " +
      info[0].getNetworkLocation());

    // Restart the DN with the invalid topology and wait for it to register.
    cluster.restartDataNode(invalidIdx);
    Thread.sleep(5000);
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      if (info.length == 2) {
        break;
      }
      if (info.length == 0) {
        LOG.info("got no valid DNs");
      } else if (info.length == 1) {
        LOG.info("got one valid DN: " + info[0].getHostName() +
            " (at " + info[0].getNetworkLocation() + ")");
      }
      Thread.sleep(1000);
    }
    Assert.assertEquals(info[0].getNetworkLocation(),
        info[1].getNetworkLocation());
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
Example 15
Source File: TestNetworkTopology.java From big-c with Apache License 2.0
@Test(timeout=180000)
public void testInvalidNetworkTopologiesNotCachedInHdfs() throws Exception {
  // start a cluster
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = null;
  try {
    // bad rack topology
    String racks[] = { "/a/b", "/c" };
    String hosts[] = { "foo1.example.com", "foo2.example.com" };
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).
      racks(racks).hosts(hosts).build();
    cluster.waitActive();

    NamenodeProtocols nn = cluster.getNameNodeRpc();
    Assert.assertNotNull(nn);

    // Wait for one DataNode to register.
    // The other DataNode will not be able to register because of the rack mismatch.
    DatanodeInfo[] info;
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      Assert.assertFalse(info.length == 2);
      if (info.length == 1) {
        break;
      }
      Thread.sleep(1000);
    }

    // Set the network topology of the other node to match the network
    // topology of the node that came up.
    int validIdx = info[0].getHostName().equals(hosts[0]) ? 0 : 1;
    int invalidIdx = validIdx == 1 ? 0 : 1;
    StaticMapping.addNodeToRack(hosts[invalidIdx], racks[validIdx]);
    LOG.info("datanode " + validIdx + " came up with network location " +
      info[0].getNetworkLocation());

    // Restart the DN with the invalid topology and wait for it to register.
    cluster.restartDataNode(invalidIdx);
    Thread.sleep(5000);
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      if (info.length == 2) {
        break;
      }
      if (info.length == 0) {
        LOG.info("got no valid DNs");
      } else if (info.length == 1) {
        LOG.info("got one valid DN: " + info[0].getHostName() +
            " (at " + info[0].getNetworkLocation() + ")");
      }
      Thread.sleep(1000);
    }
    Assert.assertEquals(info[0].getNetworkLocation(),
        info[1].getNetworkLocation());
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
Example 16
Source File: TestDataNodeMultipleRegistrations.java From hadoop with Apache License 2.0
@Test
public void testDNWithInvalidStorageWithHA() throws Exception {
  MiniDFSNNTopology top = new MiniDFSNNTopology()
    .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
      .addNN(new MiniDFSNNTopology.NNConf("nn0").setClusterId("cluster-1"))
      .addNN(new MiniDFSNNTopology.NNConf("nn1").setClusterId("cluster-1")));

  top.setFederation(true);

  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(top)
      .numDataNodes(0).build();
  try {
    cluster.startDataNodes(conf, 1, true, null, null);
    // let the initialization complete
    Thread.sleep(10000);

    DataNode dn = cluster.getDataNodes().get(0);
    assertTrue("Datanode should be running", dn.isDatanodeUp());
    assertEquals("BPOfferService should be running", 1,
        dn.getAllBpOs().length);
    DataNodeProperties dnProp = cluster.stopDataNode(0);

    cluster.getNameNode(0).stop();
    cluster.getNameNode(1).stop();
    Configuration nn1 = cluster.getConfiguration(0);
    Configuration nn2 = cluster.getConfiguration(1);

    // setting up an invalid cluster
    StartupOption.FORMAT.setClusterId("cluster-2");
    DFSTestUtil.formatNameNode(nn1);
    MiniDFSCluster.copyNameDirs(FSNamesystem.getNamespaceDirs(nn1),
        FSNamesystem.getNamespaceDirs(nn2), nn2);
    cluster.restartNameNode(0, false);
    cluster.restartNameNode(1, false);
    cluster.restartDataNode(dnProp);

    // let the initialization complete
    Thread.sleep(10000);

    dn = cluster.getDataNodes().get(0);
    assertFalse("Datanode should have shutdown as only service failed",
        dn.isDatanodeUp());
  } finally {
    cluster.shutdown();
  }
}
Example 17
Source File: TestOverReplicatedBlocks.java From RDFS with Apache License 2.0
/**
 * Test that processOverReplicatedBlock can handle corrupt replicas fine.
 * It makes sure that it won't treat corrupt replicas as valid ones,
 * thus preventing the NN from deleting valid replicas but keeping
 * corrupt ones.
 */
public void testProcesOverReplicateBlock() throws IOException {
  Configuration conf = new Configuration();
  conf.setLong("dfs.blockreport.intervalMsec", 1000L);
  conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
  FileSystem fs = cluster.getFileSystem();
  try {
    int namespaceId = cluster.getNameNode().getNamespaceID();
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);

    // corrupt the block on datanode 0
    Block block = DFSTestUtil.getFirstBlock(fs, fileName);
    TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0, cluster);
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove the block scanner log to trigger block scanning
    File scanLog = new File(cluster.getBlockDirectory("data1").getParent(),
        "dncp_block_verification.log.curr");
    scanLog.delete();

    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);

    final DatanodeID corruptDataNode =
      cluster.getDataNodes().get(2).getDNRegistrationForNS(namespaceId);
    final FSNamesystem namesystem = cluster.getNameNode().getNamesystem();
    synchronized (namesystem.heartbeats) {
      // set the live datanodes' remaining space to be 0
      // so they will be chosen to be deleted when over-replication occurs
      for (DatanodeDescriptor datanode : namesystem.heartbeats) {
        if (!corruptDataNode.equals(datanode)) {
          datanode.updateHeartbeat(100L, 100L, 0L, 100L, 0);
        }
      }
    }

    // decrease the replication factor to 1;
    namesystem.setReplication(fileName.toString(), (short)1);
    waitReplication(namesystem, block, (short)1);

    // the corrupt one won't be chosen to be the excess one
    // without 4910 the number of live replicas would be 0: the block gets lost
    assertEquals(1, namesystem.countNodes(block).liveReplicas());

    // Test the case when multiple calls to setReplication still succeed.
    System.out.println("Starting next test with file foo2.");
    final Path fileName2 = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName2, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName2, (short)3);
    LocatedBlocks lbs = namesystem.getBlockLocations(
        fileName2.toString(), 0, 10);
    Block firstBlock = lbs.get(0).getBlock();
    namesystem.setReplication(fileName2.toString(), (short)2);
    namesystem.setReplication(fileName2.toString(), (short)1);

    // wait up to one minute for excess replicas to get deleted. It is not
    // immediate because excess replicas are handled asynchronously.
    waitReplication(namesystem, firstBlock, (short)1);
    assertEquals(1, namesystem.countNodes(firstBlock).liveReplicas());
  } finally {
    cluster.shutdown();
  }
}
Example 18
Source File: TestBlocksWithNotEnoughRacks.java From hadoop with Apache License 2.0
@Test
public void testCorruptBlockRereplicatedAcrossRacks() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  int fileLen = 512;
  final Path filePath = new Path("/testFile");
  // Datanodes are spread across two racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block with a replication factor of 2
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, fileLen, REPLICATION_FACTOR, 1L);
    final String fileContent = DFSTestUtil.readFile(fs, filePath);

    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Corrupt a replica of the block
    int dnToCorrupt = DFSTestUtil.firstDnWithBlock(cluster, b);
    assertTrue(cluster.corruptReplica(dnToCorrupt, b));

    // Restart the datanode so blocks are re-scanned, and the corrupt
    // block is detected.
    cluster.restartDataNode(dnToCorrupt);

    // Wait for the namenode to notice the corrupt replica
    DFSTestUtil.waitCorruptReplicas(fs, ns, filePath, b, 1);

    // The rack policy is still respected
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Ensure all replicas are valid (the corrupt replica may not
    // have been cleaned up yet).
    for (int i = 0; i < racks.length; i++) {
      String blockContent = cluster.readBlockOnDataNode(i, b);
      if (blockContent != null && i != dnToCorrupt) {
        assertEquals("Corrupt replica", fileContent, blockContent);
      }
    }
  } finally {
    cluster.shutdown();
  }
}
Example 19
Source File: TestNodeCount.java From hadoop with Apache License 2.0
@Test
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new HdfsConfiguration();
  final MiniDFSCluster cluster =
    new MiniDFSCluster.Builder(conf).numDataNodes(REPLICATION_FACTOR).build();
  try {
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    final FileSystem fs = cluster.getFileSystem();

    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptors
    final DatanodeDescriptor[] datanodes = hm.getDatanodes();

    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();

    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getXferAddr());

    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), datanode.getXferAddr());

    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if the excess replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() == 0) {
      checkTimeout("excess replicas not detected");
    }

    // find a non-excess node
    DatanodeDescriptor nonExcessDN = null;
    for (DatanodeStorageInfo storage : bm.blocksMap.getStorages(block.getLocalBlock())) {
      final DatanodeDescriptor dn = storage.getDatanodeDescriptor();
      Collection<Block> blocks = bm.excessReplicateMap.get(dn.getDatanodeUuid());
      if (blocks == null || !blocks.contains(block.getLocalBlock())) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN != null);

    // bring down the non-excess datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getXferAddr());
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), nonExcessDN.getXferAddr());

    // The block should be replicated
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).liveReplicas() != REPLICATION_FACTOR) {
      checkTimeout("live replica count not correct", 1000);
    }

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if the excess replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() != 2) {
      checkTimeout("excess replica count not equal to 2");
    }
  } finally {
    cluster.shutdown();
  }
}
Example 20
Source File: TestOverReplicatedBlocks.java From hadoop-gpu with Apache License 2.0
/**
 * Test that processOverReplicatedBlock can handle corrupt replicas fine.
 * It makes sure that it won't treat corrupt replicas as valid ones,
 * thus preventing the NN from deleting valid replicas but keeping
 * corrupt ones.
 */
public void testProcesOverReplicateBlock() throws IOException {
  Configuration conf = new Configuration();
  conf.setLong("dfs.blockreport.intervalMsec", 1000L);
  conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
  FileSystem fs = cluster.getFileSystem();
  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);

    // corrupt the block on datanode 0
    Block block = DFSTestUtil.getFirstBlock(fs, fileName);
    TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0);
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove the block scanner log to trigger block scanning
    File scanLog = new File(System.getProperty("test.build.data"),
        "dfs/data/data1/current/dncp_block_verification.log.curr");
    // wait for one minute for deletion to succeed
    for (int i = 0; !scanLog.delete(); i++) {
      assertTrue("Could not delete log file in one minute", i < 60);
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignored) {}
    }

    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);

    final DatanodeID corruptDataNode =
      cluster.getDataNodes().get(2).dnRegistration;
    final FSNamesystem namesystem = FSNamesystem.getFSNamesystem();
    synchronized (namesystem.heartbeats) {
      // set the live datanodes' remaining space to be 0
      // so they will be chosen to be deleted when over-replication occurs
      for (DatanodeDescriptor datanode : namesystem.heartbeats) {
        if (!corruptDataNode.equals(datanode)) {
          datanode.updateHeartbeat(100L, 100L, 0L, 0);
        }
      }

      // decrease the replication factor to 1;
      namesystem.setReplication(fileName.toString(), (short)1);

      // the corrupt one won't be chosen to be the excess one
      // without 4910 the number of live replicas would be 0: the block gets lost
      assertEquals(1, namesystem.countNodes(block).liveReplicas());
    }
  } finally {
    cluster.shutdown();
  }
}