Java Code Examples for org.apache.hadoop.filecache.DistributedCache#getCacheFiles()
The following examples show how to use org.apache.hadoop.filecache.DistributedCache#getCacheFiles().
The examples are taken from open source projects; the source file and originating project are noted above each example.
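Before the project examples, here is a minimal sketch of the basic pattern: files are registered on the job Configuration (here via DistributedCache.addCacheFile()), and getCacheFiles() later returns the registered URIs, or null when nothing has been cached. The class name, path, and #fragment below are placeholders for illustration and are not taken from any of the projects.

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;

public class DistributedCacheFilesSketch {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Register a file; the optional #fragment is the symlink name tasks will see.
        // (Path and fragment are illustrative placeholders.)
        DistributedCache.addCacheFile(URI.create("/tmp/lookup.dat#lookup"), conf);

        // Read back everything registered so far; returns null if no cache files were added.
        URI[] cacheFiles = DistributedCache.getCacheFiles(conf);
        if (cacheFiles != null) {
            for (URI uri : cacheFiles) {
                System.out.println("cached: " + uri);
            }
        }
    }
}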
Example 1
Source File: CrossProductOperation.java From incubator-retired-mrql with Apache License 2.0
@Override
protected void setup ( Context context ) throws IOException, InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf, code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf, code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf, code);
        } else
            result = null;
        counter = conf.get("mrql.counter");
        // the URIs registered in the distributed cache and their local copies
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: " + e);
    }
}
Example 2
Source File: JobControlCompiler.java From spork with Apache License 2.0
private static Path getExistingDistCacheFilePath(Configuration conf, URL url) throws IOException {
    URI[] cacheFileUris = DistributedCache.getCacheFiles(conf);
    if (cacheFileUris != null) {
        String fileName = url.getRef() == null ? FilenameUtils.getName(url.getPath()) : url.getRef();
        for (URI cacheFileUri : cacheFileUris) {
            Path path = new Path(cacheFileUri);
            String cacheFileName = cacheFileUri.getFragment() == null ? path.getName() : cacheFileUri.getFragment();
            // Match
            // - if both filenames are same and no symlinks (or)
            // - if both symlinks are same (or)
            // - symlink of existing cache file is same as the name of the new file to be added.
            //   That would be the case when hbase-0.98.4.jar#hbase.jar is configured via Oozie
            //   and register hbase.jar is done in the pig script.
            // If two different files are symlinked to the same name, then there is a conflict
            // and hadoop itself does not guarantee which file will be symlinked to that name.
            // So we are good.
            if (fileName.equals(cacheFileName)) {
                return path;
            }
        }
    }
    return null;
}
Example 3
Source File: TestJobControlCompiler.java From spork with Apache License 2.0
@Test
public void testAddArchiveToDistributedCache() throws IOException {
    final File textFile = File.createTempFile("file", ".txt");
    textFile.deleteOnExit();

    final List<File> zipArchives = createFiles(".zip");
    zipArchives.add(textFile);
    final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    final PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();
    pigContext.getProperties().put("pig.streaming.ship.files",
            StringUtils.join(zipArchives, ","));
    pigContext.getProperties().put("pig.streaming.cache.files",
            StringUtils.join(tarArchives, ","));

    final JobConf jobConf = compileTestJob(pigContext, CONF);

    // the plain .txt file should be registered as a cache file,
    // while the archives end up in the cache archives
    URI[] uris = DistributedCache.getCacheFiles(jobConf);
    int sizeTxt = 0;
    for (int i = 0; i < uris.length; i++) {
        if (uris[i].toString().endsWith(".txt")) {
            sizeTxt++;
        }
    }
    Assert.assertTrue(sizeTxt == 1);
    assertFilesInDistributedCache(
            DistributedCache.getCacheArchives(jobConf), 4, ".zip", ".tgz", ".tar.gz", ".tar");
}
Example 4
Source File: TestJobControlCompiler.java From spork with Apache License 2.0
/**
 * Tests that no duplicate jars are added to distributed cache, which might cause conflicts
 * and tests with both symlinked and normal jar specification
 */
@Test
public void testNoDuplicateJarsInDistributedCache() throws Exception {

    // JobControlCompiler setup
    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();

    Configuration conf = new Configuration();
    DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf-0.jar#udf.jar")), conf, FileSystem.get(conf));
    DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf1.jar#diffname.jar")), conf, FileSystem.get(conf));
    DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf2.jar")), conf, FileSystem.get(conf));
    createAndAddResource("udf.jar", pigContext);
    createAndAddResource("udf1.jar", pigContext);
    createAndAddResource("udf2.jar", pigContext);
    createAndAddResource("another.jar", pigContext);

    final JobConf jobConf = compileTestJob(pigContext, conf);

    // verifying the jar gets on distributed cache
    URI[] cacheURIs = DistributedCache.getCacheFiles(jobConf);
    Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
    // expected - 1. udf.jar#udf.jar, 2. udf1.jar#diffname.jar 3. udf2.jar (same added twice)
    // 4. another.jar and 5. udf1.jar, and not duplicate udf.jar
    System.out.println("cache.files= " + Arrays.toString(cacheURIs));
    System.out.println("classpath.files= " + Arrays.toString(fileClassPaths));

    if (HadoopShims.isHadoopYARN()) {
        // Default jars - 5 (pig, antlr, joda-time, automaton)
        // Other jars - 10 (udf.jar#udf.jar, udf1.jar#diffname.jar, udf2.jar, udf1.jar, another.jar
        Assert.assertEquals("size 9 for " + Arrays.toString(cacheURIs), 9,
                Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
        Assert.assertEquals("size 9 for " + Arrays.toString(fileClassPaths), 9,
                Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
    } else {
        // Default jars - 5. Has guava in addition
        // There will be same entries duplicated for udf.jar and udf2.jar
        Assert.assertEquals("size 12 for " + Arrays.toString(cacheURIs), 12,
                Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
        Assert.assertEquals("size 12 for " + Arrays.toString(fileClassPaths), 12,
                Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
    }

    // Count occurrences of the resources
    Map<String, Integer> occurrences = new HashMap<String, Integer>();
    for (URI cacheURI : cacheURIs) {
        Integer val = occurrences.get(cacheURI.toString());
        val = (val == null) ? 1 : ++val;
        occurrences.put(cacheURI.toString(), val);
    }
    if (HadoopShims.isHadoopYARN()) {
        Assert.assertEquals(9, occurrences.size());
    } else {
        Assert.assertEquals(10, occurrences.size()); // guava jar in addition
    }

    for (String file : occurrences.keySet()) {
        if (!HadoopShims.isHadoopYARN() && (file.endsWith("udf.jar") || file.endsWith("udf2.jar"))) {
            // Same path added twice which is ok. It should not be a shipped to hdfs temp path.
            // We assert path is same by checking count
            Assert.assertEquals("Two occurrences for " + file, 2, (int) occurrences.get(file));
        } else {
            // check that only single occurrence even though we added once to dist cache (simulating via Oozie)
            // and second time through pig register jar when there is symlink
            Assert.assertEquals("One occurrence for " + file, 1, (int) occurrences.get(file));
        }
    }
}
Example 5
Source File: Submitter.java From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
    setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
    setIfUnset(conf, "mapred.output.key.class", textClassname);
    setIfUnset(conf, "mapred.output.value.class", textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass("mapred.pipes.user.inputformat",
                conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        DistributedCache.createSymlink(conf);
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, "mapred.map.task.debug.script", defScript);
        setIfUnset(conf, "mapred.reduce.task.debug.script", defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
Example 6
Source File: HadoopIOUtils.java From elasticsearch-hadoop with Apache License 2.0
public static InputStream open(String resource, Configuration conf) {
    ClassLoader loader = conf.getClassLoader();

    if (loader == null) {
        loader = Thread.currentThread().getContextClassLoader();
    }

    if (loader == null) {
        loader = HadoopIOUtils.class.getClassLoader();
    }

    boolean trace = log.isTraceEnabled();

    try {
        // no prefix means classpath
        if (!resource.contains(":")) {

            InputStream result = loader.getResourceAsStream(resource);
            if (result != null) {
                if (trace) {
                    log.trace(String.format("Loaded resource %s from classpath", resource));
                }
                return result;
            }

            // fall back to the distributed cache
            URI[] uris = DistributedCache.getCacheFiles(conf);
            if (uris != null) {
                for (URI uri : uris) {
                    if (uri.toString().contains(resource)) {
                        if (trace) {
                            log.trace(String.format("Loaded resource %s from distributed cache", resource));
                        }
                        return uri.toURL().openStream();
                    }
                }
            }
        }

        // fall back to file system
        Path p = new Path(resource);
        FileSystem fs = p.getFileSystem(conf);
        return fs.open(p);
    } catch (IOException ex) {
        throw new EsHadoopIllegalArgumentException(String.format("Cannot open stream for resource %s", resource));
    }
}