org.apache.hadoop.filecache.DistributedCache Java Examples
The following examples show how to use
org.apache.hadoop.filecache.DistributedCache.
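Before the individual examples, here is a minimal end-to-end sketch of the most common pattern: the driver registers an HDFS file with DistributedCache.addCacheFile(), and a mapper reads the localized copy via getLocalCacheFiles() in setup(). The class name DistributedCacheSketch, the cache path /cache/lookup.tsv, and the tab-separated lookup format are illustrative assumptions, not taken from any of the projects below. Note that org.apache.hadoop.filecache.DistributedCache is deprecated in Hadoop 2; newer code typically calls Job.addCacheFile() and context.getCacheFiles() instead.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative sketch only: ships a lookup file via the distributed cache and
// joins against it in a map-only job. Paths and names are hypothetical.
public class DistributedCacheSketch {

    public static class LookupMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Map<String, String> lookup = new HashMap<String, String>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Files registered in the driver are localized on each task node and
            // exposed here as local filesystem paths.
            Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (cached == null || cached.length == 0) {
                throw new IOException("Expected a cached lookup file but found none");
            }
            BufferedReader reader = new BufferedReader(new FileReader(cached[0].toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] parts = line.split("\t", 2);
                    if (parts.length == 2) {
                        lookup.put(parts[0], parts[1]);
                    }
                }
            } finally {
                reader.close();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Hypothetical map-side join: the first column of each input record is the lookup key.
            String[] cols = value.toString().split("\t", 2);
            String joined = lookup.get(cols[0]);
            if (joined != null) {
                context.write(new Text(cols[0]), new Text(joined));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Register the HDFS file before the Job copies the configuration.
        DistributedCache.addCacheFile(new URI("/cache/lookup.tsv"), conf);

        Job job = new Job(conf, "distributed cache sketch");
        job.setJarByClass(DistributedCacheSketch.class);
        job.setMapperClass(LookupMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The examples that follow show the same APIs as they appear in real projects.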
Example #1
Source File: JobControlCompiler.java From spork with Apache License 2.0
private static Path getExistingDistCacheFilePath(Configuration conf, URL url) throws IOException {
    URI[] cacheFileUris = DistributedCache.getCacheFiles(conf);
    if (cacheFileUris != null) {
        String fileName = url.getRef() == null ? FilenameUtils.getName(url.getPath()) : url.getRef();
        for (URI cacheFileUri : cacheFileUris) {
            Path path = new Path(cacheFileUri);
            String cacheFileName = cacheFileUri.getFragment() == null ? path.getName() : cacheFileUri.getFragment();
            // Match
            //   - if both filenames are same and no symlinks (or)
            //   - if both symlinks are same (or)
            //   - symlink of existing cache file is same as the name of the new file to be added.
            //     That would be the case when hbase-0.98.4.jar#hbase.jar is configured via Oozie
            //     and register hbase.jar is done in the pig script.
            // If two different files are symlinked to the same name, then there is a conflict
            // and hadoop itself does not guarantee which file will be symlinked to that name.
            // So we are good.
            if (fileName.equals(cacheFileName)) {
                return path;
            }
        }
    }
    return null;
}
Example #2
Source File: HiveSuite.java From elasticsearch-hadoop with Apache License 2.0
@SuppressWarnings("deprecation") @BeforeClass public static void setup() throws Exception { if (!isLocal) { hadoopConfig = HdpBootstrap.hadoopConfig(); HdfsUtils.copyFromLocal(Provisioner.ESHADOOP_TESTING_JAR, Provisioner.HDFS_ES_HDP_LIB); hdfsEsLib = HdfsUtils.qualify(Provisioner.HDFS_ES_HDP_LIB, hadoopConfig); // copy jar to DistributedCache try { DistributedCache.addArchiveToClassPath(new Path(Provisioner.HDFS_ES_HDP_LIB), hadoopConfig); } catch (IOException ex) { throw new RuntimeException("Cannot provision Hive", ex); } hdfsResource = "/eshdp/hive/hive-compund.dat"; HdfsUtils.copyFromLocal(originalResource, hdfsResource); hdfsResource = HdfsUtils.qualify(hdfsResource, hadoopConfig); hdfsJsonResource = "/eshdp/hive/hive-compund.json"; HdfsUtils.copyFromLocal(originalResource, hdfsJsonResource); hdfsJsonResource = HdfsUtils.qualify(hdfsJsonResource, hadoopConfig); } }
Example #3
Source File: BloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: BloomFilter <bloom_filter_file> <in> <out>");
        ToolRunner.printGenericCommandUsage(System.err);
        System.exit(2);
    }

    DistributedCache.addCacheFile(new URI(otherArgs[0]), conf);
    Job job = new Job(conf, "Bloom Filter");
    job.setJarByClass(BloomFilter.class);
    job.setMapperClass(BloomFilterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
Example #4
Source File: ReplicatedUserJoin.java From hadoop-map-reduce-patterns with Apache License 2.0
public void setup(Context context) throws IOException, InterruptedException {
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());

    // Read all files in the DistributedCache
    for (Path p : files) {
        BufferedReader rdr = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(new FileInputStream(new File(p.toString())))));

        String line = null;
        // For each record in the user file
        while ((line = rdr.readLine()) != null) {
            // Get the user ID for this record
            Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
            String userId = parsed.get("Id");

            // Map the user ID to the record
            userIdToInfo.put(userId, line);
        }
        rdr.close();
    }

    // Get the join type from the configuration
    joinType = context.getConfiguration().get("join.type");
}
Example #5
Source File: TeraSort.java From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
Example #6
Source File: MapFeatures.java From hadoop-book with Apache License 2.0
@Override
public void configure(JobConf job) {
    caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
    inputFile = job.get("map.input.file");

    if (job.getBoolean("wordcount.skip.patterns", false)) {
        Path[] patternsFiles = new Path[0];
        try {
            patternsFiles = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException ioe) {
            System.err.println("Caught exception getting cached files: "
                    + StringUtils.stringifyException(ioe));
        }
        for (Path patternsFile : patternsFiles) {
            parseSkipFile(patternsFile);
        }
    }
}
Example #7
Source File: L2.java From spork with Apache License 2.0
public void configure(JobConf conf) {
    try {
        Path[] paths = DistributedCache.getLocalCacheFiles(conf);
        if (paths == null || paths.length < 1) {
            throw new RuntimeException("DistributedCache no work.");
        }

        // Open the small table
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(paths[0].toString())));
        String line;
        hash = new HashSet<String>(500);
        while ((line = reader.readLine()) != null) {
            if (line.length() < 1) continue;
            String[] fields = line.split("");
            hash.add(fields[0]);
        }
        reader.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Example #8
Source File: VisualJob.java From multimedia-indexing with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    String inputPath = args[0];
    String outputPath = args[1];

    if (!IS_LOCAL & args.length >= 3) {
        String configFile = args[2];
        if (configFile != null) {
            getConf().addResource(configFile);
        }
        // The learning files have to be uploaded to the s3 bucket first.
        // Then, when starting the job, they have to be added to the hadoop distributed cache.
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_0.csv#surf_l2_128c_0.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_1.csv#surf_l2_128c_1.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_2.csv#surf_l2_128c_2.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_3.csv#surf_l2_128c_3.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pca_surf_4x128_32768to1024.txt#pca_surf_4x128_32768to1024.txt"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/qcoarse_1024d_8192k.csv#qcoarse_1024d_8192k.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pq_1024_64x8_rp_ivf_8192k.csv#pq_1024_64x8_rp_ivf_8192k.csv"), getConf());
    }

    Job job = createJob(inputPath, outputPath);
    return job.waitForCompletion(true) ? 0 : -1;
}
Example #9
Source File: JobLibLoader.java From SpyGlass with Apache License 2.0
public static void loadJars(String libPathStr, Configuration config) {
    try {
        Path libPath = new Path(libPathStr);
        FileSystem fs = FileSystem.get(config);
        RemoteIterator<LocatedFileStatus> itr = fs.listFiles(libPath, true);

        while (itr.hasNext()) {
            LocatedFileStatus f = itr.next();

            if (!f.isDirectory() && f.getPath().getName().endsWith("jar")) {
                logger.info("Loading Jar : " + f.getPath().getName());
                DistributedCache.addFileToClassPath(f.getPath(), config);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        logger.error(e.toString());
    }
}
Example #10
Source File: SolrOutputFormat.java From examples with Apache License 2.0
public static void addSolrConfToDistributedCache(Job job, File solrHomeZip) throws IOException {
    // Make a reasonably unique name for the zip file in the distributed cache
    // to avoid collisions if multiple jobs are running.
    String hdfsZipName = UUID.randomUUID().toString() + '.' + ZIP_FILE_BASE_NAME;
    Configuration jobConf = job.getConfiguration();
    jobConf.set(ZIP_NAME, hdfsZipName);

    Path zipPath = new Path("/tmp", getZipName(jobConf));
    FileSystem fs = FileSystem.get(jobConf);
    fs.copyFromLocalFile(new Path(solrHomeZip.toString()), zipPath);
    final URI baseZipUrl = fs.getUri().resolve(zipPath.toString() + '#' + getZipName(jobConf));

    DistributedCache.addCacheArchive(baseZipUrl, jobConf);
    LOG.debug("Set Solr distributed cache: {}", Arrays.asList(job.getCacheArchives()));
    LOG.debug("Set zipPath: {}", zipPath);
    // Actually send the path for the configuration zip file
    jobConf.set(SETUP_OK, zipPath.toString());
}
Example #11
Source File: MapJoin.java From BigData-In-Practice with Apache License 2.0
@Override
protected void setup(Mapper<LongWritable, Text, NullWritable, Emp_Dep>.Context context)
        throws IOException, InterruptedException {
    // Preprocessing: load the file to be joined from the distributed cache
    Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    // Only one file is cached here, so the first entry is enough; create a BufferedReader to read it
    BufferedReader reader = new BufferedReader(new FileReader(paths[0].toString()));
    String str = null;
    try {
        // Read the file line by line
        while ((str = reader.readLine()) != null) {
            // Split each record of the cached table
            String[] splits = str.split("\t");
            // Store the useful fields from the split array in a Map
            joinData.put(Integer.parseInt(splits[0]), splits[1]);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        reader.close();
    }
}
Example #12
Source File: AvroDistributedCacheFileReader.java From ml-ease with Apache License 2.0
@Override
protected List<Path> getPaths(String filePath) throws IOException {
    Path[] localFiles = DistributedCache.getLocalCacheFiles(getConf());
    List<Path> paths = new ArrayList<Path>();

    for (Path file : localFiles) {
        if (!file.toString().contains(filePath)) {
            continue;
        }
        paths.add(file);
    }
    return paths;
}
Example #13
Source File: CrossProductOperation.java From incubator-retired-mrql with Apache License 2.0
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf, code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf, code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf, code);
        } else {
            result = null;
        }
        counter = conf.get("mrql.counter");
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: " + e);
    }
}
Example #14
Source File: TeraSort.java From hadoop-gpu with Apache License 2.0
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
Example #15
Source File: DistributedCacheHelper.java From datafu with Apache License 2.0
/**
 * Deserializes an object from a path in HDFS.
 *
 * @param conf Hadoop configuration
 * @param path Path to deserialize from
 * @return Deserialized object
 * @throws IOException IOException
 */
public static Object readObject(Configuration conf, org.apache.hadoop.fs.Path path) throws IOException {
    String localPath = null;
    Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(conf);
    for (Path localCacheFile : localCacheFiles) {
        if (localCacheFile.getName().endsWith(path.getName())) {
            localPath = localCacheFile.getName();
            break;
        }
    }
    if (localPath == null) {
        throw new RuntimeException("Could not find " + path + " in local cache");
    }
    FileInputStream inputStream = new FileInputStream(new File(localPath));
    ObjectInputStream objStream = new ObjectInputStream(inputStream);

    try {
        try {
            return objStream.readObject();
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    } finally {
        objStream.close();
        inputStream.close();
    }
}
Example #16
Source File: BloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public void setup(Context context) throws IOException, InterruptedException {
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());

    System.out.println("Reading Bloom filter from: " + files[0]);
    DataInputStream stream = new DataInputStream(new FileInputStream(files[0].toString()));
    filter.readFields(stream);
    stream.close();
}
Example #17
Source File: ReduceSideJoinBloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0
public void setup(Context context) throws IOException {
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    DataInputStream strm = new DataInputStream(new FileInputStream(new File(files[0].toString())));
    bfilter.readFields(strm);
}
Example #18
Source File: QueryTestParams.java From elasticsearch-hadoop with Apache License 2.0
@SuppressWarnings("deprecation") public <T extends Configuration> T provisionQueries(T cfg) { if (HadoopCfgUtils.isLocal(cfg)) { return cfg; } try { DistributedCache.addFileToClassPath(new Path(TestData.unpackResource(QUERY_DSL, stagingLocation).getAbsolutePath()), cfg); DistributedCache.addFileToClassPath(new Path(TestData.unpackResource(QUERY_URI, stagingLocation).getAbsolutePath()), cfg); } catch (IOException ex) { } return cfg; }
Example #19
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0
protected void setup(Context context) throws IOException, InterruptedException {
    average = getAveragePostsPerUser(context.getConfiguration());
    mos = new MultipleOutputs<Text, Text>(context);

    try {
        Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());

        if (files == null || files.length == 0) {
            throw new RuntimeException("User information is not set in DistributedCache");
        }

        // Read all files in the DistributedCache
        for (Path p : files) {
            BufferedReader rdr = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(new FileInputStream(new File(p.toString())))));

            String line;
            // For each record in the user file
            while ((line = rdr.readLine()) != null) {
                // Get the user ID and reputation
                Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
                String userId = parsed.get("Id");
                String reputation = parsed.get("Reputation");

                if (userId != null && reputation != null) {
                    // Map the user ID to the reputation
                    userIdToReputation.put(userId, reputation);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #20
Source File: MRWordCountFeatures.java From hadoop-book with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), MRWordCount.class);
    conf.setJobName("WordCountFeatures");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapFeatures.class);
    conf.setCombinerClass(ReduceFeatures.class);
    conf.setReducerClass(ReduceFeatures.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
            conf.setBoolean("wordcount.skip.patterns", true);
        } else {
            other_args.add(args[i]);
        }
    }

    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
Example #21
Source File: SolrRecordWriter.java From hbase-indexer with Apache License 2.0
public static Path findSolrConfig(Configuration conf) throws IOException {
    // FIXME when mrunit supports the new cache apis
    //URI[] localArchives = context.getCacheArchives();
    Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
    for (Path unpackedDir : localArchives) {
        if (unpackedDir.getName().equals(SolrOutputFormat.getZipName(conf))) {
            LOG.info("Using this unpacked directory as solr home: {}", unpackedDir);
            return unpackedDir;
        }
    }
    throw new IOException(String.format(Locale.ENGLISH,
            "No local cache archives, where is %s:%s",
            SolrOutputFormat.getSetupOk(), SolrOutputFormat.getZipName(conf)));
}
Example #22
Source File: TeraSort.java From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    long startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    long endTime = System.currentTimeMillis();
    System.out.println((float) (endTime - startTime) / 1000);
    LOG.info("done");
    return 0;
}
Example #23
Source File: DependencyLoader.java From mrgeo with Apache License 2.0
private static void addFileToClasspath(Configuration conf, Set<String> existing,
        FileSystem fs, Path hdfsBase, File file) throws IOException {
    Path hdfsPath = new Path(hdfsBase, file.getName());
    if (!existing.contains(hdfsPath.toString())) {
        if (fs.exists(hdfsPath)) {
            // check the timestamp and exit if the one in hdfs is "newer"
            FileStatus status = fs.getFileStatus(hdfsPath);

            if (file.lastModified() <= status.getModificationTime()) {
                log.debug(file.getPath() + " up to date");
                DistributedCache.addFileToClassPath(hdfsPath, conf, fs);
                existing.add(hdfsPath.toString());
                return;
            }
        }

        // copy the file...
        log.debug("Copying " + file.getPath() + " to HDFS for distribution");

        fs.copyFromLocalFile(new Path(file.getCanonicalFile().toURI()), hdfsPath);
        DistributedCache.addFileToClassPath(hdfsPath, conf, fs);
        existing.add(hdfsPath.toString());
    }
}
Example #24
Source File: GroupedKeyRangePartitioner.java From accumulo-recipes with Apache License 2.0
/**
 * Sets the hdfs file name to use, containing a newline separated list of Base64 encoded
 * split points that represent ranges for partitioning
 */
public static void addSplitFile(JobContext job, String group, String file) {
    URI uri = new Path(file).toUri();
    DistributedCache.addCacheFile(uri, job.getConfiguration());
    String[] groups = job.getConfiguration().getStrings(GROUPS_KEY);
    if (groups == null || Arrays.binarySearch(groups, group) == -1) {
        // Grow the existing array by one, or start a one-element array if no groups were set yet
        // (an empty array here would make the assignment below go out of bounds).
        String[] newGroups = groups != null ? Arrays.copyOf(groups, groups.length + 1) : new String[1];
        newGroups[newGroups.length - 1] = group;
        job.getConfiguration().setStrings(GROUPS_KEY, newGroups);
        job.getConfiguration().set(GROUPS_KEY + "." + group, file);
    }
}
Example #25
Source File: AccumuloMrGeoRangePartitioner.java From mrgeo with Apache License 2.0
/**
 * Sets the hdfs file name to use, containing a newline separated list of Base64 encoded
 * split points that represent ranges for partitioning
 */
public static void setSplitFile(JobContext job, String file) {
    URI uri = new Path(file).toUri();
    DistributedCache.addCacheFile(uri, job.getConfiguration());
    job.getConfiguration().set(CUTFILE_KEY, uri.getPath());
}
Example #26
Source File: MRCompactorJobRunner.java From incubator-gobblin with Apache License 2.0
private void addJars(Configuration conf) throws IOException {
    if (!this.dataset.jobProps().contains(MRCompactor.COMPACTION_JARS)) {
        return;
    }
    Path jarFileDir = new Path(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_JARS));
    for (FileStatus status : this.fs.listStatus(jarFileDir)) {
        DistributedCache.addFileToClassPath(status.getPath(), conf, this.fs);
    }
}
Example #27
Source File: Main.java From hiped2 with Apache License 2.0
public static void runJob(Path inputPath, Path smallFilePath, Path outputPath) throws Exception {
    Configuration conf = new Configuration();

    FileSystem fs = smallFilePath.getFileSystem(conf);
    FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);

    if (smallFilePathStatus.isDir()) {
        for (FileStatus f : fs.listStatus(smallFilePath)) {
            if (f.getPath().getName().startsWith("part")) {
                DistributedCache.addCacheFile(f.getPath().toUri(), conf);
            }
        }
    } else {
        DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
    }

    Job job = new Job(conf);

    job.setJarByClass(Main.class);
    job.setMapperClass(GenericReplicatedJoin.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setNumReduceTasks(0);

    outputPath.getFileSystem(conf).delete(outputPath, true);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}
Example #28
Source File: CompactionJobConfigurator.java From incubator-gobblin with Apache License 2.0
protected void addJars(Configuration conf, State state, FileSystem fs) throws IOException {
    if (!state.contains(MRCompactor.COMPACTION_JARS)) {
        return;
    }
    Path jarFileDir = new Path(state.getProp(MRCompactor.COMPACTION_JARS));
    for (FileStatus status : fs.listStatus(jarFileDir)) {
        DistributedCache.addFileToClassPath(status.getPath(), conf, fs);
    }
}
Example #29
Source File: JobLibLoader.java From SpyGlass with Apache License 2.0
public static void addFiletoCache(String libPathStr, Configuration config) {
    try {
        Path filePath = new Path(libPathStr);
        DistributedCache.addCacheFile(filePath.toUri(), config);
        // DistributedCache.createSymlink(config);
        // config.set("mapred.cache.files", libPathStr);
        // config.set("mapred.create.symlink", "yes");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Example #30
Source File: AvroUtils.java From ml-ease with Apache License 2.0
/**
 * Given a path to an output folder, it finds the existing "*.avro" files and adds
 * them as cache files to be distributed. Throws an exception if no files are found/added.
 *
 * @param conf Job configuration
 * @param outPath The path to the hdfs directory that has part files to cache
 * @throws Exception If no file is found at outPath throws a RuntimeException
 */
public static void addAvroCacheFiles(JobConf conf, Path outPath) throws Exception {
    FileStatus[] partFiles = getAvroPartFiles(conf, outPath);
    if (partFiles.length == 0) {
        throw new RuntimeException("DistributedCacheFileUtils: No (part) file is found to cache at location:" + outPath);
    }

    for (FileStatus partFile : partFiles) {
        // add the file and set fileRead to true, since we have read at least one file
        DistributedCache.addCacheFile(partFile.getPath().toUri(), conf);
    }
}