Java Code Examples for org.apache.hadoop.mapreduce.InputSplit#getLocations()
The following examples show how to use org.apache.hadoop.mapreduce.InputSplit#getLocations().
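An InputSplit's getLocations() returns the names of the nodes where the split's data resides. The MapReduce scheduler uses these names as locality hints when assigning tasks, and the array may legitimately be empty. As a rough sketch of the contract (the class name HostListSplit and its fields are hypothetical, not taken from any of the projects below), a custom split might look like this:

import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;

// Hypothetical split that reports a fixed set of preferred hosts.
public class HostListSplit extends InputSplit {

    private final long length;
    private final String[] hosts;

    public HostListSplit(long length, String[] hosts) {
        this.length = length;
        this.hosts = hosts;
    }

    @Override
    public long getLength() throws IOException, InterruptedException {
        // Size of the split's data in bytes.
        return length;
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        // Locality hints only; the framework may still run the task elsewhere.
        return hosts;
    }
}

Note that a real split normally also implements Writable (or is otherwise serializable) so the framework can ship it to tasks. The examples below show how production code builds these location arrays, typically by merging the locations of child splits.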
Example 1
Source File: TabletSplitSplit.java From datawave with Apache License 2.0
/**
 * Collect a set of hosts from all child InputSplits.
 *
 * @throws InterruptedException
 */
public String[] getLocations() throws IOException, InterruptedException {
    HashSet<String> hosts = new HashSet<>();
    for (InputSplit s : splits) {
        String[] hints = s.getLocations();
        if (hints != null && hints.length > 0) {
            Collections.addAll(hosts, hints);
        }
    }
    return hosts.toArray(new String[hosts.size()]);
}
Example 2
Source File: RedisHashRecordReader.java From Redis-4.x-Cookbook with MIT License
public void initialize(InputSplit split, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    host = split.getLocations()[0];
    prefix = ((RedisHashInputSplit) split).getPrefix();
    key = ((RedisHashInputSplit) split).getKey();
    String hashKey = prefix + ":" + key;
    jedis = new Jedis(host);
    log.info("Connect to " + host);
    jedis.connect();
    jedis.getClient().setTimeoutInfinite();
    totalKVs = jedis.hlen(hashKey);
    keyValueMapIter = jedis.hgetAll(hashKey).entrySet().iterator();
}
Example 3
Source File: CompositeInputSplit.java From hadoop with Apache License 2.0
/**
 * Collect a set of hosts from all child InputSplits.
 */
public String[] getLocations() throws IOException, InterruptedException {
    HashSet<String> hosts = new HashSet<String>();
    for (InputSplit s : splits) {
        String[] hints = s.getLocations();
        if (hints != null && hints.length > 0) {
            for (String host : hints) {
                hosts.add(host);
            }
        }
    }
    return hosts.toArray(new String[hosts.size()]);
}
Example 4
Source File: JobSplit.java From hadoop with Apache License 2.0
public SplitMetaInfo(InputSplit split, long startOffset) throws IOException {
    try {
        this.locations = split.getLocations();
        this.inputDataLength = split.getLength();
        this.startOffset = startOffset;
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
}
Example 5
Source File: CombineDocumentSplit.java From marklogic-contentpump with Apache License 2.0
public CombineDocumentSplit(List<FileSplit> splits)
        throws IOException, InterruptedException {
    this.splits = splits;
    locations = new HashSet<String>();
    for (InputSplit split : splits) {
        length += split.getLength();
        for (String loc : split.getLocations()) {
            if (!locations.contains(loc)) {
                locations.add(loc);
            }
        }
    }
}
Example 6
Source File: CompositeInputSplit.java From big-c with Apache License 2.0
/**
 * Collect a set of hosts from all child InputSplits.
 */
public String[] getLocations() throws IOException, InterruptedException {
    HashSet<String> hosts = new HashSet<String>();
    for (InputSplit s : splits) {
        String[] hints = s.getLocations();
        if (hints != null && hints.length > 0) {
            for (String host : hints) {
                hosts.add(host);
            }
        }
    }
    return hosts.toArray(new String[hosts.size()]);
}
Example 7
Source File: JobSplit.java From big-c with Apache License 2.0
public SplitMetaInfo(InputSplit split, long startOffset) throws IOException {
    try {
        this.locations = split.getLocations();
        this.inputDataLength = split.getLength();
        this.startOffset = startOffset;
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
}
Example 8
Source File: PigSplit.java From spork with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public String[] getLocations() throws IOException, InterruptedException {
    if (locations == null) {
        HashMap<String, Long> locMap = new HashMap<String, Long>();
        Long lenInMap;
        for (InputSplit split : wrappedSplits) {
            String[] locs = split.getLocations();
            for (String loc : locs) {
                if ((lenInMap = locMap.get(loc)) == null)
                    locMap.put(loc, split.getLength());
                else
                    locMap.put(loc, lenInMap + split.getLength());
            }
        }
        Set<Map.Entry<String, Long>> entrySet = locMap.entrySet();
        Map.Entry<String, Long>[] hostSize =
            entrySet.toArray(new Map.Entry[entrySet.size()]);
        Arrays.sort(hostSize, new Comparator<Map.Entry<String, Long>>() {
            @Override
            public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
                long diff = o1.getValue() - o2.getValue();
                if (diff < 0) return 1;
                if (diff > 0) return -1;
                return 0;
            }
        });
        // maximum 5 locations are in list: refer to PIG-1648 for more details
        int nHost = Math.min(hostSize.length, 5);
        locations = new String[nHost];
        for (int i = 0; i < nHost; ++i) {
            locations[i] = hostSize[i].getKey();
        }
    }
    return locations;
}
Example 9
Source File: TestCombineFileInputFormat.java From hadoop with Apache License 2.0
@Test
public void testNodeDistribution() throws IOException, InterruptedException {
    DummyInputFormat inFormat = new DummyInputFormat();
    int numBlocks = 60;
    long totLength = 0;
    long blockSize = 100;
    int numNodes = 10;

    long minSizeNode = 50;
    long minSizeRack = 50;
    int maxSplitSize = 200; // 4 blocks per split.

    String[] locations = new String[numNodes];
    for (int i = 0; i < numNodes; i++) {
        locations[i] = "h" + i;
    }
    String[] racks = new String[0];
    Path path = new Path("hdfs://file");

    OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
    int hostCountBase = 0;
    // Generate block list. Replication 3 per block.
    for (int i = 0; i < numBlocks; i++) {
        int localHostCount = hostCountBase;
        String[] blockHosts = new String[3];
        for (int j = 0; j < 3; j++) {
            int hostNum = localHostCount % numNodes;
            blockHosts[j] = "h" + hostNum;
            localHostCount++;
        }
        hostCountBase++;
        blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, blockHosts, racks);
        totLength += blockSize;
    }

    List<InputSplit> splits = new ArrayList<InputSplit>();
    HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
    Map<String, Set<OneBlockInfo>> nodeToBlocks = new TreeMap<String, Set<OneBlockInfo>>();

    OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);

    inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength,
        maxSplitSize, minSizeNode, minSizeRack, splits);

    int expectedSplitCount = (int) (totLength / maxSplitSize);
    assertEquals(expectedSplitCount, splits.size());

    // Ensure 90+% of the splits have node local blocks.
    // 100% locality may not always be achieved.
    int numLocalSplits = 0;
    for (InputSplit inputSplit : splits) {
        assertEquals(maxSplitSize, inputSplit.getLength());
        if (inputSplit.getLocations().length == 1) {
            numLocalSplits++;
        }
    }
    assertTrue(numLocalSplits >= 0.9 * splits.size());
}
Example 10
Source File: JobSplit.java From hadoop with Apache License 2.0
public TaskSplitMetaInfo(InputSplit split, long startOffset)
        throws InterruptedException, IOException {
    this(new TaskSplitIndex("", startOffset), split.getLocations(),
        split.getLength());
}
Example 11
Source File: TestCombineFileInputFormat.java From big-c with Apache License 2.0
@Test
public void testNodeDistribution() throws IOException, InterruptedException {
    DummyInputFormat inFormat = new DummyInputFormat();
    int numBlocks = 60;
    long totLength = 0;
    long blockSize = 100;
    int numNodes = 10;

    long minSizeNode = 50;
    long minSizeRack = 50;
    int maxSplitSize = 200; // 4 blocks per split.

    String[] locations = new String[numNodes];
    for (int i = 0; i < numNodes; i++) {
        locations[i] = "h" + i;
    }
    String[] racks = new String[0];
    Path path = new Path("hdfs://file");

    OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
    int hostCountBase = 0;
    // Generate block list. Replication 3 per block.
    for (int i = 0; i < numBlocks; i++) {
        int localHostCount = hostCountBase;
        String[] blockHosts = new String[3];
        for (int j = 0; j < 3; j++) {
            int hostNum = localHostCount % numNodes;
            blockHosts[j] = "h" + hostNum;
            localHostCount++;
        }
        hostCountBase++;
        blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, blockHosts, racks);
        totLength += blockSize;
    }

    List<InputSplit> splits = new ArrayList<InputSplit>();
    HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
    Map<String, Set<OneBlockInfo>> nodeToBlocks = new TreeMap<String, Set<OneBlockInfo>>();

    OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);

    inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength,
        maxSplitSize, minSizeNode, minSizeRack, splits);

    int expectedSplitCount = (int) (totLength / maxSplitSize);
    assertEquals(expectedSplitCount, splits.size());

    // Ensure 90+% of the splits have node local blocks.
    // 100% locality may not always be achieved.
    int numLocalSplits = 0;
    for (InputSplit inputSplit : splits) {
        assertEquals(maxSplitSize, inputSplit.getLength());
        if (inputSplit.getLocations().length == 1) {
            numLocalSplits++;
        }
    }
    assertTrue(numLocalSplits >= 0.9 * splits.size());
}
Example 12
Source File: JobSplit.java From big-c with Apache License 2.0
public TaskSplitMetaInfo(InputSplit split, long startOffset)
        throws InterruptedException, IOException {
    this(new TaskSplitIndex("", startOffset), split.getLocations(),
        split.getLength());
}
Example 13
Source File: CqlRecordReader.java From stratio-cassandra with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    this.split = (ColumnFamilySplit) split;
    Configuration conf = HadoopCompat.getConfiguration(context);
    totalRowCount = (this.split.getLength() < Long.MAX_VALUE)
            ? (int) this.split.getLength()
            : ConfigHelper.getInputSplitSize(conf);
    cfName = ConfigHelper.getInputColumnFamily(conf);
    keyspace = ConfigHelper.getInputKeyspace(conf);
    partitioner = ConfigHelper.getInputPartitioner(conf);
    inputColumns = CqlConfigHelper.getInputcolumns(conf);
    userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

    try {
        if (cluster != null)
            return;

        // create a Cluster instance
        String[] locations = split.getLocations();
        cluster = CqlConfigHelper.getInputCluster(locations, conf);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    if (cluster != null)
        session = cluster.connect(quote(keyspace));

    if (session == null)
        throw new RuntimeException("Can't create connection session");

    // get negotiated serialization protocol
    nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion();

    // If the user provides a CQL query then we will use it without validation,
    // otherwise we will fall back to building a query using the:
    //   inputColumns
    //   whereClauses
    cqlQuery = CqlConfigHelper.getInputCql(conf);
    // validate that the user hasn't tried to give us a custom query along with
    // input columns and where clauses
    if (StringUtils.isNotEmpty(cqlQuery)
            && (StringUtils.isNotEmpty(inputColumns)
                || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
        throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
    }

    if (StringUtils.isEmpty(cqlQuery))
        cqlQuery = buildQuery();
    logger.debug("cqlQuery {}", cqlQuery);

    rowIterator = new RowIterator();
    logger.debug("created {}", rowIterator);
}