Java Code Examples for org.apache.spark.api.java.JavaSparkContext#addFile()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#addFile(). This method ships a file from the driver to every node in the cluster; a Spark task can later resolve the local copy of that file with SparkFiles.get(fileName).
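Before the project-specific examples, here is a minimal, self-contained sketch of the addFile()/SparkFiles.get() pattern that all of them rely on. The file path /tmp/lookup.txt, the class name, and the local[*] master are placeholders chosen for illustration, not part of any project below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaSparkContext;

public class AddFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("addFile-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Driver side: register the file so Spark copies it to every node.
            sc.addFile("/tmp/lookup.txt");

            // Executor side: tasks resolve the local copy by file name, not by the original path.
            sc.parallelize(Arrays.asList(1, 2, 3))
              .map(x -> "record " + x + " resolved " + SparkFiles.get("lookup.txt"))
              .collect()
              .forEach(System.out::println);
        }
    }
}

Note that SparkFiles.get() takes only the base file name; addFile() keeps the base name of whatever local or remote path was registered, which is why several examples below return just the file name to their callers.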
Example 1
Source File: FileIO.java From RP-DBSCAN with Apache License 2.0
public static List<String> broadCastData(JavaSparkContext sc, Configuration conf, String dirPath) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(dirPath));
    List<String> metaPaths = new ArrayList<String>();
    long size = 0;
    for (int i = 0; i < status.length; i++) {
        String path = status[i].getPath().toString();
        String fileName = status[i].getPath().getName();
        sc.addFile(path);
        metaPaths.add(fileName);
        size += status[i].getLen();
    }
    System.out.println("size : " + size);
    return metaPaths;
}
Example 2
Source File: GATKSparkTool.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Register the reference file (and associated dictionary and index) to be downloaded to every node using Spark's
 * copying mechanism ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param referencePath the reference file, can be a local file or a remote path
 * @return the reference file name; the absolute path of the file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static String addReferenceFilesForSpark(JavaSparkContext ctx, Path referencePath) {
    if (referencePath == null) {
        return null;
    }
    Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(referencePath);
    Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referencePath);
    Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(referencePath);

    ctx.addFile(referencePath.toUri().toString());
    if (Files.exists(indexPath)) {
        ctx.addFile(indexPath.toUri().toString());
    }
    if (Files.exists(dictPath)) {
        ctx.addFile(dictPath.toUri().toString());
    }
    if (Files.exists(gziPath)) {
        ctx.addFile(gziPath.toUri().toString());
    }

    return referencePath.getFileName().toString();
}
Example 3
Source File: GATKSparkTool.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Register the VCF files (and associated indexes) to be downloaded to every node using Spark's copying mechanism
 * ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param vcfFileNames the VCF files, can be local files or remote paths
 * @return the VCF file names; the absolute path of each file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static List<String> addVCFsForSpark(JavaSparkContext ctx, List<String> vcfFileNames) {
    for (String vcfFileName : vcfFileNames) {
        String vcfIndexFileName;
        if (vcfFileName.endsWith(FileExtensions.VCF)) {
            vcfIndexFileName = vcfFileName + FileExtensions.VCF_INDEX;
        } else if (vcfFileName.endsWith(FileExtensions.COMPRESSED_VCF)) {
            vcfIndexFileName = vcfFileName + FileExtensions.COMPRESSED_VCF_INDEX;
        } else {
            throw new IllegalArgumentException("Unrecognized known sites file extension. Must be .vcf or .vcf.gz");
        }
        ctx.addFile(vcfFileName);
        if (Files.exists(IOUtils.getPath(vcfIndexFileName))) {
            ctx.addFile(vcfIndexFileName);
        }
    }
    return vcfFileNames.stream().map(name -> IOUtils.getPath(name).getFileName().toString()).collect(Collectors.toList());
}
Example 4
Source File: BwaSparkEngine.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * @param ctx the Spark context
 * @param referenceFile the path to the reference file named <i>_prefix_.fa</i>, which is used to find the image file with name <i>_prefix_.fa.img</i>.
 *                      Can be <code>null</code> if the indexFileName is provided.
 * @param indexFileName the index image file name that already exists, or <code>null</code> to have the image file automatically distributed.
 * @param inputHeader the SAM file header to use for reads
 * @param refDictionary the sequence dictionary to use for reads if the SAM file header doesn't have one (or it's empty)
 */
public BwaSparkEngine(final JavaSparkContext ctx,
                      final String referenceFile,
                      final String indexFileName,
                      SAMFileHeader inputHeader,
                      final SAMSequenceDictionary refDictionary) {
    Utils.nonNull(referenceFile);
    Utils.nonNull(inputHeader);
    this.ctx = ctx;
    if (indexFileName != null) {
        this.indexFileName = indexFileName;
        this.resolveIndexFileName = false;
    } else {
        String indexFile = referenceFile + REFERENCE_INDEX_IMAGE_FILE_SUFFIX;
        ctx.addFile(indexFile); // distribute index file to all executors
        this.indexFileName = IOUtils.getPath(indexFile).getFileName().toString();
        this.resolveIndexFileName = true;
    }
    if (inputHeader.getSequenceDictionary() == null || inputHeader.getSequenceDictionary().isEmpty()) {
        Utils.nonNull(refDictionary);
        inputHeader = inputHeader.clone();
        inputHeader.setSequenceDictionary(refDictionary);
    }
    broadcastHeader = ctx.broadcast(inputHeader);
}
Example 5
Source File: SparkSegmentGenerationJobRunner.java From incubator-pinot with Apache License 2.0
protected void packPluginsToDistributedCache(JavaSparkContext sparkContext) {
    String pluginsRootDir = PluginManager.get().getPluginsRootDir();
    if (pluginsRootDir == null) {
        LOGGER.warn("Local Pinot plugins directory is null, skip packaging...");
        return;
    }
    if (new File(pluginsRootDir).exists()) {
        File pluginsTarGzFile = new File(PINOT_PLUGINS_TAR_GZ);
        try {
            TarGzCompressionUtils.createTarGzOfDirectory(pluginsRootDir, pluginsTarGzFile.getPath());
        } catch (IOException e) {
            LOGGER.error("Failed to tar plugins directory", e);
        }
        sparkContext.addFile(pluginsTarGzFile.getAbsolutePath());

        String pluginsIncludes = System.getProperty(PLUGINS_INCLUDE_PROPERTY_NAME);
        if (pluginsIncludes != null) {
            sparkContext.getConf().set(PLUGINS_INCLUDE_PROPERTY_NAME, pluginsIncludes);
        }
    } else {
        LOGGER.warn("Cannot find local Pinot plugins directory at [{}]", pluginsRootDir);
    }
}
Example 6
Source File: EntitySalienceTrainingSparkRunner.java From ambiverse-nlu with Apache License 2.0
@Override
protected int run() throws Exception {
    SparkConf sparkConf = new SparkConf()
            .setAppName("EntitySalienceTrainingSparkRunner")
            .set("spark.hadoop.validateOutputSpecs", "false")
            .set("spark.yarn.executor.memoryOverhead", "3072")
            .set("spark.rdd.compress", "true")
            .set("spark.core.connection.ack.wait.timeout", "600")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            //.set("spark.kryo.registrationRequired", "true")
            .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class,
                    double[].class, InternalRow[].class, GenericInternalRow.class, Object[].class,
                    GenericArrayData.class, VectorIndexer.class})
            ;//.setMaster("local[4]"); //Remove this if you run it on the server.

    TrainingSettings trainingSettings = new TrainingSettings();

    if (folds != null) {
        trainingSettings.setNumFolds(folds);
    }
    if (method != null) {
        trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.valueOf(method));
    }
    if (defaultConf != null) {
        trainingSettings.setAidaDefaultConf(defaultConf);
    }
    if (scalingFactor != null) {
        trainingSettings.setPositiveInstanceScalingFactor(scalingFactor);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
            * Integer.parseInt(sc.getConf().get("spark.executor.cores"));
//    int totalCores = 4;
////    trainingSettings.setFeatureExtractor(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE);
////    trainingSettings.setAidaDefaultConf("db");
//    //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
//    trainingSettings.setPositiveInstanceScalingFactor(1);

    //Add the cache files to each node only if annotation is required.
    //The input documents could already be annotated, and in this case no caches are needed.
    if (trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
        sc.addFile(trainingSettings.getBigramCountCache());
        sc.addFile(trainingSettings.getKeywordCountCache());
        sc.addFile(trainingSettings.getWordContractionsCache());
        sc.addFile(trainingSettings.getWordExpansionsCache());
        if (trainingSettings.getAidaDefaultConf().equals("db")) {
            sc.addFile(trainingSettings.getDatabaseAida());
        } else {
            sc.addFile(trainingSettings.getCassandraConfig());
        }
    }

    SQLContext sqlContext = new SQLContext(sc);
    FileSystem fs = FileSystem.get(new Configuration());

    int partitionNumber = 3 * totalCores;
    if (partitions != null) {
        partitionNumber = partitions;
    }

    //Read training documents serialized as SCAS
    JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

    //Instantiate a training spark runner
    TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

    //Train a model
    CrossValidatorModel model = trainingSparkRunner.crossValidate(sc, sqlContext, documents, trainingSettings);

    //Create the model path
    String modelPath = output + "/" + sc.getConf().getAppId() + "/model_" + trainingSettings.getClassificationMethod();

    //Delete the old model if there is one
    fs.delete(new Path(modelPath), true);

    //Save the new model
    List<Model> models = new ArrayList<>();
    models.add(model.bestModel());
    sc.parallelize(models, 1).saveAsObjectFile(modelPath);

    //Save the model stats
    SparkClassificationModel.saveStats(model, trainingSettings, output + "/" + sc.getConf().getAppId() + "/");

    return 0;
}
Example 7
Source File: EntitySalienceTestingSparkRunner.java From ambiverse-nlu with Apache License 2.0
@Override
protected int run() throws Exception {
    SparkConf sparkConf = new SparkConf()
            .setAppName("EntitySalienceTrainingSparkRunner")
            .set("spark.hadoop.validateOutputSpecs", "false")
            //.set("spark.yarn.executor.memoryOverhead", "4096")
            .set("spark.rdd.compress", "true")
            .set("spark.core.connection.ack.wait.timeout", "600")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            //.set("spark.kryo.registrationRequired", "true")
            .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class,
                    double[].class, InternalRow[].class, GenericInternalRow.class, Object[].class,
                    GenericArrayData.class, VectorIndexer.class})
            ;//setMaster("local"); //Remove this if you run it on the server.

    TrainingSettings trainingSettings = new TrainingSettings();

    if (defaultConf != null) {
        trainingSettings.setAidaDefaultConf(defaultConf);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
            * Integer.parseInt(sc.getConf().get("spark.executor.cores"));
//    int totalCores = 2;

    //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
    trainingSettings.setPositiveInstanceScalingFactor(1);

    if (trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
        sc.addFile(trainingSettings.getBigramCountCache());
        sc.addFile(trainingSettings.getKeywordCountCache());
        sc.addFile(trainingSettings.getWordContractionsCache());
        sc.addFile(trainingSettings.getWordExpansionsCache());
        if (trainingSettings.getAidaDefaultConf().equals("db")) {
            sc.addFile(trainingSettings.getDatabaseAida());
        } else {
            sc.addFile(trainingSettings.getCassandraConfig());
        }
    }

    SQLContext sqlContext = new SQLContext(sc);

    int partitionNumber = 3 * totalCores;

    //Read training documents serialized as SCAS
    JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

    //Instantiate a training spark runner
    TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

    PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

    //Evaluate the model and write down the evaluation metrics.
    trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings,
            output + "/" + sc.getConf().getAppId() + "/");

    return 0;
}