Java Code Examples for org.apache.hadoop.filecache.DistributedCache#getLocalCacheFiles()
The following examples show how to use org.apache.hadoop.filecache.DistributedCache#getLocalCacheFiles(). Each example is taken from an open-source project; the project and source file it comes from are noted above the snippet.
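All of the snippets below read files that a job driver has previously registered with the distributed cache. For orientation, here is a minimal, hypothetical driver-side sketch of that registration step; the HDFS path, job name, and class name are placeholder assumptions and are not taken from any of the projects listed below.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.Job;

public class DistributedCacheDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Hypothetical HDFS path: register the file with the distributed cache so the
        // framework copies it to the local disk of every task node before tasks start.
        DistributedCache.addCacheFile(new URI("/user/example/lookup.txt"), conf);

        Job job = new Job(conf, "distributed-cache-example");
        // ... set mapper/reducer classes, input and output paths, etc. ...

        // Inside a task, DistributedCache.getLocalCacheFiles(configuration) returns the
        // local filesystem paths of those copies, which is what the examples below use.
        job.waitForCompletion(true);
    }
}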
Example 1
Source File: MapJoin.java From BigData-In-Practice with Apache License 2.0
@Override
protected void setup(Mapper<LongWritable, Text, NullWritable, Emp_Dep>.Context context)
        throws IOException, InterruptedException {
    // Setup: the table to be joined has already been placed in the distributed cache
    Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    // Only one file is cached here, so the first entry is enough; create a BufferedReader to read it
    BufferedReader reader = new BufferedReader(new FileReader(paths[0].toString()));
    String str = null;
    try {
        // Read line by line
        while ((str = reader.readLine()) != null) {
            // Split each record of the cached table
            String[] splits = str.split("\t");
            // Store the useful fields from the array in a Map
            joinData.put(Integer.parseInt(splits[0]), splits[1]);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        reader.close();
    }
}
Example 2
Source File: CrossProductOperation.java From incubator-retired-mrql with Apache License 2.0
@Override
protected void setup ( Context context ) throws IOException, InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf, code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf, code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf, code);
        } else
            result = null;
        counter = conf.get("mrql.counter");
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: " + e);
    }
}
Example 3
Source File: AvroDistributedCacheFileReader.java From ml-ease with Apache License 2.0
@Override
protected List<Path> getPaths(String filePath) throws IOException {
    Path[] localFiles = DistributedCache.getLocalCacheFiles(getConf());
    List<Path> paths = new ArrayList<Path>();
    for (Path file : localFiles) {
        if (!file.toString().contains(filePath)) {
            continue;
        }
        paths.add(file);
    }
    return paths;
}
Example 4
Source File: L2.java From spork with Apache License 2.0
public void configure(JobConf conf) {
    try {
        Path[] paths = DistributedCache.getLocalCacheFiles(conf);
        if (paths == null || paths.length < 1) {
            throw new RuntimeException("DistributedCache no work.");
        }
        // Open the small table
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(paths[0].toString())));
        String line;
        hash = new HashSet<String>(500);
        while ((line = reader.readLine()) != null) {
            if (line.length() < 1) continue;
            String[] fields = line.split("");
            hash.add(fields[0]);
        }
        reader.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Example 5
Source File: ReplicatedUserJoin.java From hadoop-map-reduce-patterns with Apache License 2.0
public void setup(Context context) throws IOException, InterruptedException {
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    // Read all files in the DistributedCache
    for (Path p : files) {
        BufferedReader rdr = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(new FileInputStream(new File(p.toString())))));
        String line = null;
        // For each record in the user file
        while ((line = rdr.readLine()) != null) {
            // Get the user ID for this record
            Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
            String userId = parsed.get("Id");
            // Map the user ID to the record
            userIdToInfo.put(userId, line);
        }
        rdr.close();
    }
    // Get the join type from the configuration
    joinType = context.getConfiguration().get("join.type");
}
Example 6
Source File: MapFeatures.java From hadoop-book with Apache License 2.0
@Override
public void configure(JobConf job) {
    caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
    inputFile = job.get("map.input.file");
    if (job.getBoolean("wordcount.skip.patterns", false)) {
        Path[] patternsFiles = new Path[0];
        try {
            patternsFiles = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException ioe) {
            System.err.println("Caught exception getting cached files: "
                    + StringUtils.stringifyException(ioe));
        }
        for (Path patternsFile : patternsFiles) {
            parseSkipFile(patternsFile);
        }
    }
}
Example 7
Source File: BloomJoin.java From hiped2 with Apache License 2.0
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // Load the Bloom filter from the first (and only) cached file
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    filter = BloomFilterDumper.fromFile(new File(files[0].toString()));
    System.out.println("Filter = " + filter);
}
Example 8
Source File: DistributedCacheHelper.java From datafu with Apache License 2.0
/**
 * Deserializes an object from a path in HDFS.
 *
 * @param conf Hadoop configuration
 * @param path Path to deserialize from
 * @return Deserialized object
 * @throws IOException IOException
 */
public static Object readObject(Configuration conf, org.apache.hadoop.fs.Path path) throws IOException {
    String localPath = null;
    Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(conf);
    for (Path localCacheFile : localCacheFiles) {
        if (localCacheFile.getName().endsWith(path.getName())) {
            localPath = localCacheFile.getName();
            break;
        }
    }
    if (localPath == null) {
        throw new RuntimeException("Could not find " + path + " in local cache");
    }
    FileInputStream inputStream = new FileInputStream(new File(localPath));
    ObjectInputStream objStream = new ObjectInputStream(inputStream);
    try {
        try {
            return objStream.readObject();
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    } finally {
        objStream.close();
        inputStream.close();
    }
}
Example 9
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0
protected void setup(Context context) throws IOException, InterruptedException {
    average = getAveragePostsPerUser(context.getConfiguration());
    mos = new MultipleOutputs<Text, Text>(context);
    try {
        Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (files == null || files.length == 0) {
            throw new RuntimeException("User information is not set in DistributedCache");
        }
        // Read all files in the DistributedCache
        for (Path p : files) {
            BufferedReader rdr = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(new FileInputStream(new File(p.toString())))));
            String line;
            // For each record in the user file
            while ((line = rdr.readLine()) != null) {
                // Get the user ID and reputation
                Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
                String userId = parsed.get("Id");
                String reputation = parsed.get("Reputation");
                if (userId != null && reputation != null) {
                    // Map the user ID to the reputation
                    userIdToReputation.put(userId, reputation);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example 10
Source File: ReduceSideJoinBloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0
public void setup(Context context) throws IOException {
    // Deserialize the Bloom filter from the first cached file
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    DataInputStream strm = new DataInputStream(
            new FileInputStream(new File(files[0].toString())));
    bfilter.readFields(strm);
}
Example 11
Source File: BloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public void setup(Context context) throws IOException, InterruptedException {
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    System.out.println("Reading Bloom filter from: " + files[0]);
    DataInputStream stream = new DataInputStream(
            new FileInputStream(files[0].toString()));
    filter.readFields(stream);
    stream.close();
}
Example 12
Source File: JobLibLoader.java From SpyGlass with Apache License 2.0
public static Path[] getFileFromCache(String libPathStr, Configuration config) {
    Path[] localFiles = null;
    try {
        logger.info("Local Cache => " + DistributedCache.getLocalCacheFiles(config));
        logger.info("Hadoop Cache => " + DistributedCache.getCacheFiles(config));
        if (DistributedCache.getLocalCacheFiles(config) != null) {
            localFiles = DistributedCache.getLocalCacheFiles(config);
        }
        logger.info("LocalFiles => " + localFiles);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return localFiles;
}
Example 13
Source File: FileCache.java From Cubert with Apache License 2.0
public static void initialize(Configuration conf) throws IOException {
    // Remember the configuration and the local paths of all files in the distributed cache
    FileCache.conf = conf;
    cachedFiles = DistributedCache.getLocalCacheFiles(conf);
}
Example 14
Source File: AccumuloMrGeoRangePartitioner.java From mrgeo with Apache License 2.0
@SuppressFBWarnings(value = "PATH_TRAVERSAL_IN", justification = "Cutpoints file generated by code")
private synchronized TileIdWritable[] getCutPoints() throws IOException {
    if (cutPointArray == null) {
        String cutFileName = conf.get(CUTFILE_KEY);
        Path[] cf = DistributedCache.getLocalCacheFiles(conf);
        if (cf != null) {
            for (Path path : cf) {
                if (path.toUri().getPath()
                        .endsWith(cutFileName.substring(cutFileName.lastIndexOf('/')))) {
                    TreeSet<Text> cutPoints = new TreeSet<Text>();
                    try (Scanner in = new Scanner(new BufferedReader(new FileReader(path.toString())))) {
                        while (in.hasNextLine()) {
                            cutPoints.add(new Text(Base64Utils.decodeToString(in.nextLine())));
                        }
                    } catch (ClassNotFoundException e) {
                        throw new IOException("Error decoding cutpoints", e);
                    }
                    cutPointArray = cutPoints.toArray(new Text[cutPoints.size()]);
                    break;
                }
            }
        }
        if (cutPointArray == null) {
            throw new FileNotFoundException(cutFileName + " not found in distributed cache");
        }
    }
    tileIdPointArray = new TileIdWritable[cutPointArray.length];
    for (int x = 0; x < cutPointArray.length; x++) {
        byte[] b = cutPointArray[x].getBytes();
        ByteBuffer buffer = ByteBuffer.wrap(b);
        long k = buffer.getLong();
        tileIdPointArray[x] = new TileIdWritable(k);
    }
    return tileIdPointArray;
}
Example 15
Source File: GroupedKeyRangePartitioner.java From accumulo-recipes with Apache License 2.0
private synchronized Text[] getCutPoints() throws IOException {
    if (cutPointArray == null) {
        Path[] cf = DistributedCache.getLocalCacheFiles(conf);
        if (cf != null) {
            Map<String, String> curFilesAndGroups = getCurFilesAndGroups();
            SortedMap<String, SortedSet<String>> cutPointMap = new TreeMap<String, SortedSet<String>>();
            for (Path path : cf) {
                String group = null;
                for (Map.Entry<String, String> groupSplits : curFilesAndGroups.entrySet()) {
                    if (path.toString().endsWith(groupSplits.getKey()))
                        group = groupSplits.getValue();
                }

                if (group != null) {
                    Scanner in = new Scanner(new BufferedReader(new FileReader(path.toString())));
                    try {
                        while (in.hasNextLine()) {
                            String split = new String(Base64.decodeBase64(in.nextLine().getBytes()));
                            SortedSet<String> splits = cutPointMap.get(group);
                            if (splits == null) {
                                splits = new TreeSet<String>();
                                cutPointMap.put(group, splits);
                            }
                            // Collect the decoded split point under its group
                            splits.add(split);
                        }
                        SortedSet<Text> treeSet = new TreeSet<Text>();
                        for (Map.Entry<String, SortedSet<String>> entry : cutPointMap.entrySet()) {
                            treeSet.add(new Text(entry.getKey() + NULL_BYTE + NULL_BYTE));
                            for (String string : entry.getValue())
                                treeSet.add(new Text(entry.getKey() + NULL_BYTE + string));
                            treeSet.add(new Text(entry.getKey() + NULL_BYTE + END_BYTE));
                        }
                        cutPointArray = treeSet.toArray(new Text[]{});
                    } finally {
                        in.close();
                    }
                    break;
                } else {
                    throw new FileNotFoundException("A file was not found in distribution cache files: " + path.toString());
                }
            }
        }
    }
    return cutPointArray;
}