Java Code Examples for org.apache.hadoop.filecache.DistributedCache#getCacheFiles()
The following examples show how to use org.apache.hadoop.filecache.DistributedCache#getCacheFiles().
The examples are taken from open source projects; the source file and originating project are noted above each example.
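Before the project examples, here is a minimal sketch of the basic pattern: files are registered on the job Configuration (here via DistributedCache.addCacheFile()), and getCacheFiles() later returns the registered URIs, or null when nothing has been cached. The class name, path, and #fragment below are placeholders for illustration and are not taken from any of the projects.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;

public class DistributedCacheFilesSketch {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Register a file; the optional #fragment is the symlink name tasks will see.
        // (Path and fragment are illustrative placeholders.)
        DistributedCache.addCacheFile(URI.create("/tmp/lookup.dat#lookup"), conf);

        // Read back everything registered so far; returns null if no cache files were added.
        URI[] cacheFiles = DistributedCache.getCacheFiles(conf);
        if (cacheFiles != null) {
            for (URI uri : cacheFiles) {
                System.out.println("cached: " + uri);
            }
        }
    }
}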
Example 1
Source File: CrossProductOperation.java From incubator-retired-mrql with Apache License 2.0
@Override
protected void setup ( Context context ) throws IOException, InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf, code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf, code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf, code);
        } else
            result = null;
        counter = conf.get("mrql.counter");
        // the URIs registered in the distributed cache and their local copies
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: " + e);
    }
}
Example 2
Source File: JobControlCompiler.java From spork with Apache License 2.0
private static Path getExistingDistCacheFilePath(Configuration conf, URL url) throws IOException {
    URI[] cacheFileUris = DistributedCache.getCacheFiles(conf);
    if (cacheFileUris != null) {
        String fileName = url.getRef() == null ? FilenameUtils.getName(url.getPath()) : url.getRef();
        for (URI cacheFileUri : cacheFileUris) {
            Path path = new Path(cacheFileUri);
            String cacheFileName = cacheFileUri.getFragment() == null ? path.getName() : cacheFileUri.getFragment();
            // Match
            // - if both filenames are same and no symlinks (or)
            // - if both symlinks are same (or)
            // - symlink of existing cache file is same as the name of the new file to be added.
            //   That would be the case when hbase-0.98.4.jar#hbase.jar is configured via Oozie
            //   and register hbase.jar is done in the pig script.
            // If two different files are symlinked to the same name, then there is a conflict
            // and hadoop itself does not guarantee which file will be symlinked to that name.
            // So we are good.
            if (fileName.equals(cacheFileName)) {
                return path;
            }
        }
    }
    return null;
}
Example 3
Source File: TestJobControlCompiler.java From spork with Apache License 2.0
@Test
public void testAddArchiveToDistributedCache() throws IOException {
    final File textFile = File.createTempFile("file", ".txt");
    textFile.deleteOnExit();

    final List<File> zipArchives = createFiles(".zip");
    zipArchives.add(textFile);
    final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    final PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();
    pigContext.getProperties().put("pig.streaming.ship.files",
            StringUtils.join(zipArchives, ","));
    pigContext.getProperties().put("pig.streaming.cache.files",
            StringUtils.join(tarArchives, ","));

    final JobConf jobConf = compileTestJob(pigContext, CONF);

    // the plain .txt file should be registered as a cache file,
    // while the archives end up in the cache archives
    URI[] uris = DistributedCache.getCacheFiles(jobConf);
    int sizeTxt = 0;
    for (int i = 0; i < uris.length; i++) {
        if (uris[i].toString().endsWith(".txt")) {
            sizeTxt++;
        }
    }
    Assert.assertTrue(sizeTxt == 1);
    assertFilesInDistributedCache(
            DistributedCache.getCacheArchives(jobConf), 4, ".zip", ".tgz", ".tar.gz", ".tar");
}
Example 4
Source File: TestJobControlCompiler.java From spork with Apache License 2.0
/**
 * Tests that no duplicate jars are added to distributed cache, which might cause conflicts
 * and tests with both symlinked and normal jar specification
 */
@Test
public void testNoDuplicateJarsInDistributedCache() throws Exception {

    // JobControlCompiler setup
    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();

    Configuration conf = new Configuration();
    DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf-0.jar#udf.jar")), conf, FileSystem.get(conf));
    DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf1.jar#diffname.jar")), conf, FileSystem.get(conf));
    DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf2.jar")), conf, FileSystem.get(conf));
    createAndAddResource("udf.jar", pigContext);
    createAndAddResource("udf1.jar", pigContext);
    createAndAddResource("udf2.jar", pigContext);
    createAndAddResource("another.jar", pigContext);

    final JobConf jobConf = compileTestJob(pigContext, conf);

    // verifying the jar gets on distributed cache
    URI[] cacheURIs = DistributedCache.getCacheFiles(jobConf);
    Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
    // expected - 1. udf.jar#udf.jar, 2. udf1.jar#diffname.jar 3. udf2.jar (same added twice)
    // 4. another.jar and 5. udf1.jar, and not duplicate udf.jar
    System.out.println("cache.files= " + Arrays.toString(cacheURIs));
    System.out.println("classpath.files= " + Arrays.toString(fileClassPaths));

    if (HadoopShims.isHadoopYARN()) {
        // Default jars - 5 (pig, antlr, joda-time, automaton)
        // Other jars - 10 (udf.jar#udf.jar, udf1.jar#diffname.jar, udf2.jar, udf1.jar, another.jar
        Assert.assertEquals("size 9 for " + Arrays.toString(cacheURIs), 9,
                Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
        Assert.assertEquals("size 9 for " + Arrays.toString(fileClassPaths), 9,
                Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
    } else {
        // Default jars - 5. Has guava in addition
        // There will be same entries duplicated for udf.jar and udf2.jar
        Assert.assertEquals("size 12 for " + Arrays.toString(cacheURIs), 12,
                Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
        Assert.assertEquals("size 12 for " + Arrays.toString(fileClassPaths), 12,
                Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
    }

    // Count occurrences of the resources
    Map<String, Integer> occurrences = new HashMap<String, Integer>();
    for (URI cacheURI : cacheURIs) {
        Integer val = occurrences.get(cacheURI.toString());
        val = (val == null) ? 1 : ++val;
        occurrences.put(cacheURI.toString(), val);
    }
    if (HadoopShims.isHadoopYARN()) {
        Assert.assertEquals(9, occurrences.size());
    } else {
        Assert.assertEquals(10, occurrences.size()); // guava jar in addition
    }

    for (String file : occurrences.keySet()) {
        if (!HadoopShims.isHadoopYARN() && (file.endsWith("udf.jar") || file.endsWith("udf2.jar"))) {
            // Same path added twice which is ok. It should not be a shipped to hdfs temp path.
            // We assert path is same by checking count
            Assert.assertEquals("Two occurrences for " + file, 2, (int) occurrences.get(file));
        } else {
            // check that only single occurrence even though we added once to dist cache (simulating via Oozie)
            // and second time through pig register jar when there is symlink
            Assert.assertEquals("One occurrence for " + file, 1, (int) occurrences.get(file));
        }
    }
}
Example 5
Source File: Submitter.java From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
    setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
    setIfUnset(conf, "mapred.output.key.class", textClassname);
    setIfUnset(conf, "mapred.output.value.class", textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass("mapred.pipes.user.inputformat",
                conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        DistributedCache.createSymlink(conf);
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, "mapred.map.task.debug.script", defScript);
        setIfUnset(conf, "mapred.reduce.task.debug.script", defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
Example 6
Source File: HadoopIOUtils.java From elasticsearch-hadoop with Apache License 2.0
public static InputStream open(String resource, Configuration conf) {
    ClassLoader loader = conf.getClassLoader();

    if (loader == null) {
        loader = Thread.currentThread().getContextClassLoader();
    }

    if (loader == null) {
        loader = HadoopIOUtils.class.getClassLoader();
    }

    boolean trace = log.isTraceEnabled();

    try {
        // no prefix means classpath
        if (!resource.contains(":")) {

            InputStream result = loader.getResourceAsStream(resource);
            if (result != null) {
                if (trace) {
                    log.trace(String.format("Loaded resource %s from classpath", resource));
                }
                return result;
            }

            // fall back to the distributed cache
            URI[] uris = DistributedCache.getCacheFiles(conf);
            if (uris != null) {
                for (URI uri : uris) {
                    if (uri.toString().contains(resource)) {
                        if (trace) {
                            log.trace(String.format("Loaded resource %s from distributed cache", resource));
                        }
                        return uri.toURL().openStream();
                    }
                }
            }
        }

        // fall back to file system
        Path p = new Path(resource);
        FileSystem fs = p.getFileSystem(conf);
        return fs.open(p);
    } catch (IOException ex) {
        throw new EsHadoopIllegalArgumentException(String.format("Cannot open stream for resource %s", resource));
    }
}