Python pyspark.SparkConf() Examples
The following are 30 code examples of pyspark.SparkConf(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to the original project or source file. You may also want to check out all available functions and classes of the pyspark module, or try the search function.
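Before the project examples, here is a minimal, self-contained sketch of the typical SparkConf workflow; the application name, master URL, and memory value are illustrative placeholders rather than settings taken from any example below.

from pyspark import SparkConf, SparkContext

# Build a configuration object; keys mirror the spark-defaults.conf property names.
conf = SparkConf() \
    .setAppName("sparkconf-demo") \
    .setMaster("local[2]") \
    .set("spark.executor.memory", "1g")

# Hand the configuration to a SparkContext (or to SparkSession.builder.config(conf=conf)).
sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.executor.memory"))  # -> "1g"
sc.stop()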
Example #1
Source File: spark_process.py From dispel4py with Apache License 2.0 | 7 votes |
def run():
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args)
Example #2
Source File: test_spark_model_export.py From mlflow with Apache License 2.0 | 6 votes |
def spark_context():
    conf = pyspark.SparkConf()
    conf.set(key="spark.jars.packages",
             value='ml.combust.mleap:mleap-spark-base_2.11:0.12.0,'
                   'ml.combust.mleap:mleap-spark_2.11:0.12.0')
    max_tries = 3
    for num_tries in range(max_tries):
        try:
            spark = get_spark_session(conf)
            return spark.sparkContext
        except Exception as e:
            if num_tries >= max_tries - 1:
                raise
            _logger.exception(e, "Attempt %s to create a SparkSession failed, retrying..." % num_tries)
Example #3
Source File: build.py From sift with MIT License | 6 votes |
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)

    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print '\n'.join(str(i) for i in m.take(self.sample))

    log.info('Done.')
Example #4
Source File: taar_dynamo.py From telemetry-airflow with Mozilla Public License 2.0 | 6 votes |
def main(date, aws_access_key_id, aws_secret_access_key, region, table, sample_rate):
    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    APP_NAME = "TaarDynamo"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d") - PATCH_DAYS

    reduction_output = run_etljob(
        spark,
        date_obj,
        region,
        table,
        sample_rate,
        aws_access_key_id,
        aws_secret_access_key,
    )
    pprint(reduction_output)
Example #5
Source File: spark.py From pyFTS with GNU General Public License v3.0 | 6 votes |
def create_spark_conf(**kwargs):
    """
    Configure the Spark master node

    :param kwargs:
    :return:
    """
    spark_executor_memory = kwargs.get("spark_executor_memory", "2g")
    spark_driver_memory = kwargs.get("spark_driver_memory", "2g")
    url = kwargs.get("url", SPARK_ADDR)
    app = kwargs.get("app", 'pyFTS')

    conf = SparkConf()
    conf.setMaster(url)
    conf.setAppName(app)
    conf.set("spark.executor.memory", spark_executor_memory)
    conf.set("spark.driver.memory", spark_driver_memory)
    conf.set("spark.memory.offHeap.enabled", True)
    conf.set("spark.memory.offHeap.size", "16g")

    return conf
Example #6
Source File: config.py From pytest-spark with MIT License | 6 votes |
def initialize(cls, options_from_ini=None):
    if cls._instance:
        return cls._instance

    from pyspark import SparkConf

    cls._instance = SparkConf()
    cls.options = dict(cls.DEFAULTS)

    if options_from_ini:
        cls.options.update(cls._parse_config(options_from_ini))

    for k, v in cls.options.items():
        cls._instance.set(k, v)

    return cls._instance
Example #7
Source File: testconfig.py From SMV with Apache License 2.0 | 6 votes |
def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm

        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())

        sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe")\
            .set("spark.sql.warehouse.dir", hivedir)\
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example #8
Source File: finance_similarity.py From Spark-in-Finance-Quantitative-Investing with Apache License 2.0 | 6 votes |
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
Example #9
Source File: test_ExtractCCLinks.py From cccatalog with MIT License | 6 votes |
def setUpClass(cls):
    # load sample warc files
    fh = open('tests/sample_wat.paths')
    cls.watPaths = fh.readlines()

    # initialize class
    cls.cclinks = CCLinks('CC-MAIN-2018-13', 5)
    cls.cclinks.output = 'tests/output/{}/parquet'.format(cls.cclinks.crawlIndex)

    # remove output directory
    if os.path.exists(cls.cclinks.output):
        shutil.rmtree('tests/output')

    # init pyspark
    conf = pyspark.SparkConf().setMaster('local[*]').setAppName('Test_ExtractCCLinks')
    cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)
Example #10
Source File: sparkcc.py From cc-pyspark with MIT License | 6 votes |
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()

    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example #11
Source File: spark_conf.py From airflow-pipeline with Apache License 2.0 | 6 votes |
def set_spark_defaults(conf, name='spark-job'):
    """
    Update the configuration dictionary for setting up spark,
    creating the dictionary if it does not exist yet
    """
    if not conf:
        conf = dict()

    home = os.path.join('/tmp', str(uuid.uuid4()))

    conf['SparkConfiguration'] = SparkConf()\
        .setMaster('yarn-client')\
        .setAppName(name)\
        .set("spark.sql.shuffle.partitions", "1000")\
        .set("spark.scheduler.revive.interval", "3")\
        .set("spark.task.maxFailures", "0")\
        .set("spark.executorEnv.HOME", home)

    return conf
Example #12
Source File: spark.py From qb with MIT License | 6 votes |
def create_spark_context(app_name="Quiz Bowl", configs=None) -> SparkContext:
    if QB_SPARK_MASTER != "":
        log.info("Spark master is %s" % QB_SPARK_MASTER)
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)\
            .setMaster(QB_SPARK_MASTER)
    else:
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)

    if configs is not None:
        for key, value in configs:
            if key in ('spark.executor.cores', 'spark.max.cores'):
                if value > QB_MAX_CORES:
                    log.info('Requested {r_cores} cores when the machine only has {n_cores} cores, '
                             'reducing number of cores to {n_cores}'.format(
                                 r_cores=value, n_cores=QB_MAX_CORES))
                    value = QB_MAX_CORES
            spark_conf = spark_conf.set(key, value)

    return SparkContext.getOrCreate(spark_conf)
Example #13
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def spark():
    conf = pyspark.SparkConf()
    return get_spark_session(conf)
Example #14
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def score_model_as_udf(model_uri, pandas_df, result_type="double"):
    spark = get_spark_session(pyspark.SparkConf())
    spark_df = spark.createDataFrame(pandas_df)
    pyfunc_udf = spark_udf(spark=spark, model_uri=model_uri, result_type=result_type)
    new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
    return [x['prediction'] for x in new_df.collect()]
Example #15
Source File: ozy_streaming.py From ozymandias with MIT License | 5 votes |
def main(): """Run Spark Streaming""" conf = SparkConf() sc = SparkContext(appName='Ozymandias', conf=conf) sc.setLogLevel('WARN') with open(ROOT + 'channels.json', 'r') as f: channels = json.load(f) topics = [t['topic'] for t in channels['channels']] n_secs = 0.5 ssc = StreamingContext(sc, n_secs) stream = KafkaUtils.createDirectStream(ssc, topics, { 'bootstrap.servers':'localhost:9092', 'group.id':'ozy-group', 'fetch.message.max.bytes':'15728640', 'auto.offset.reset':'largest'}) stream.map( deserializer ).map( image_detector ).foreachRDD( message_sender) ssc.start() ssc.awaitTermination()
Example #16
Source File: taar_dynamo.py From python_mozetl with MIT License | 5 votes |
def main(date, region, table, prod_iam_role, sample_rate):
    APP_NAME = "HBaseAddonRecommenderView"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d")

    if prod_iam_role.strip() == "":
        prod_iam_role = None

    reduction_output = run_etljob(
        spark, date_obj, region, table, prod_iam_role, sample_rate
    )
    pprint(reduction_output)
Example #17
Source File: holoclean.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def _init_spark(self):
    """
    Set spark configuration

    :return: Spark session
    :return: Spark context
    """
    conf = SparkConf()

    # Link PG driver to Spark
    conf.set("spark.executor.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set("spark.driver.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set('spark.driver.memory', '20g')
    conf.set('spark.executor.memory', '20g')
    conf.set("spark.network.timeout", "6000")
    conf.set("spark.rpc.askTimeout", "99999")
    conf.set("spark.worker.timeout", "60000")
    conf.set("spark.driver.maxResultSize", '70g')
    conf.set("spark.ui.showConsoleProgress", "false")

    if self.spark_cluster:
        conf.set("spark.master", self.spark_cluster)

    # Gets Spark context
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    sql_ctxt = SQLContext(sc)
    return sql_ctxt.sparkSession, sql_ctxt
Example #18
Source File: hyperparameters_tuning.py From intro_ds with Apache License 2.0 | 5 votes |
def startSpark():
    """
    Create the SparkContext, which is the entry point of a Spark program
    """
    conf = SparkConf().setAppName("grid search example")
    sc = SparkContext(conf=conf)
    return sc
Example #19
Source File: reagent_sql_test_base.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def getConf(self):
    conf = SparkConf()
    for k, v in DEFAULT_SPARK_CONFIG.items():
        conf.set(k, v)
    return conf
Example #20
Source File: ClimatologySpark2.py From incubator-sdap-nexus with Apache License 2.0 | 5 votes |
def configureSpark(sparkConfig, appName, memoryPerExecutor='4G', coresPerExecutor=1):
    mode, numExecutors, numPartitions = sparkConfig.split(',')
    numExecutors = int(numExecutors)
    print >> sys.stderr, 'numExecutors = ', numExecutors
    numPartitions = int(numPartitions)
    print >> sys.stderr, 'numPartitions = ', numPartitions

    if mode == 'multicore':
        print >> sys.stderr, 'Using pysparkling'
        import pysparkling
        sc = pysparkling.Context()
    else:
        print >> sys.stderr, 'Using PySpark'
        sparkMaster = mode
        spConf = SparkConf()
        spConf.setAppName(appName)
        spConf.set("spark.executorEnv.HOME",
                   os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
        try:
            sparkMaster = SparkMasterOverride
        except:
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
        spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)

    return sc, numExecutors, numPartitions
Example #21
Source File: test.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    master = os.getenv('MASTER')
    assert master is not None, "Please start a Spark standalone cluster and export MASTER to your env."

    num_workers = os.getenv('SPARK_WORKER_INSTANCES')
    assert num_workers is not None, "Please export SPARK_WORKER_INSTANCES to your env."
    cls.num_workers = int(num_workers)

    spark_jars = os.getenv('SPARK_CLASSPATH')
    assert spark_jars, "Please add path to tensorflow/ecosystem/hadoop jar to SPARK_CLASSPATH."

    cls.conf = SparkConf().set('spark.jars', spark_jars)
    cls.sc = SparkContext(master, cls.__name__, conf=cls.conf)
    cls.spark = SparkSession.builder.getOrCreate()
Example #22
Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0 | 5 votes |
def main(
    date,
    aws_access_key_id,
    aws_secret_access_key,
    bucket,
    prefix,
    elastic_net_param,
    reg_param,
    min_installed_addons,
    client_sample_date_from,
    sample_rate,
):
    print("Sampling clients since {}".format(client_sample_date_from))

    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    ctx = default_context()

    APP_NAME = "TaarEnsemble"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    taar_training = extract(
        spark, client_sample_date_from, min_installed_addons, sample_rate
    )
    coefs = transform(ctx, spark, taar_training, reg_param, elastic_net_param)
    load(coefs, date, prefix, bucket)
Example #23
Source File: tests.py From pyspark-cassandra with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    # connect to cassandra and create a keyspace for testing
    cls.session = Cluster().connect()
    cls.session.execute('''
        CREATE KEYSPACE IF NOT EXISTS %s
        WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
    ''' % (cls.keyspace,))
    cls.session.set_keyspace(CassandraTestCase.keyspace)

    # create a cassandra spark context
    cls.sc = CassandraSparkContext(
        conf=SparkConf().setAppName("PySpark Cassandra Test"))
Example #24
Source File: tasks.py From flask-spark-docker with MIT License | 5 votes |
def create_task(words):
    conf = SparkConf().setAppName('letter count')
    sc = SparkContext(conf=conf)
    seq = words.split()
    data = sc.parallelize(seq)
    counts = data.map(lambda word: (word, 1)).reduceByKey(add).collect()
    sc.stop()
    return dict(counts)
Example #25
Source File: context.py From LearningApacheSpark with MIT License | 5 votes |
def getOrCreate(cls, checkpointPath, setupFunc):
    """
    Either recreate a StreamingContext from checkpoint data or create a new StreamingContext.
    If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be
    recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
    will be used to create a new context.

    @param checkpointPath: Checkpoint directory used in an earlier streaming program
    @param setupFunc: Function to create a new context and setup DStreams
    """
    cls._ensure_initialized()
    gw = SparkContext._gateway

    # Check whether valid checkpoint information exists in the given path
    ssc_option = gw.jvm.StreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)
    if ssc_option.isEmpty():
        ssc = setupFunc()
        ssc.checkpoint(checkpointPath)
        return ssc

    jssc = gw.jvm.JavaStreamingContext(ssc_option.get())

    # If there is already an active instance of Python SparkContext use it, or create a new one
    if not SparkContext._active_spark_context:
        jsc = jssc.sparkContext()
        conf = SparkConf(_jconf=jsc.getConf())
        SparkContext(conf=conf, gateway=gw, jsc=jsc)

    sc = SparkContext._active_spark_context

    # update ctx in serializer
    cls._transformerSerializer.ctx = sc
    return StreamingContext(sc, None, jssc)
Example #26
Source File: launcher.py From spylon with BSD 3-Clause "New" or "Revised" License | 5 votes |
def spark_context(self, application_name):
    """Create a spark context given the parameters configured in this class.

    The caller is responsible for calling ``.close`` on the resulting spark context

    Parameters
    ----------
    application_name : string

    Returns
    -------
    sc : SparkContext
    """
    # initialize the spark configuration
    self._init_spark()
    import pyspark
    import pyspark.sql

    # initialize conf
    spark_conf = pyspark.SparkConf()
    for k, v in self._spark_conf_helper._conf_dict.items():
        spark_conf.set(k, v)

    log.info("Starting SparkContext")
    return pyspark.SparkContext(appName=application_name, conf=spark_conf)
Example #27
Source File: conftest.py From elephas with MIT License | 5 votes |
def sql_context(request):
    """ fixture for creating a Spark SQLContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sql_context
Example #28
Source File: conftest.py From elephas with MIT License | 5 votes |
def spark_context(request):
    """ fixture for creating a SparkContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    request.addfinalizer(lambda: sc.stop())

    quiet_py4j()
    return sc
Example #29
Source File: test_deeds.py From cccatalog with MIT License | 5 votes |
def spark_context(request):
    conf = (SparkConf()
            .setMaster("spark://ec2-54-167-211-230.compute-1.amazonaws.com:7077")
            .setAppName("commonsmapper-pyspark-local-testing")
            .set("spark.jars", "../jars/hadoop-aws-2.8.1.jar,../jars/hadoop-auth-2.8.1.jar,../jars/aws-java-sdk-1.11.212.jar,../jars/postgresql-42.1.4.jar")
            .set("spark.driver.extraClassPath", "../jars/")
            )
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId",
                                      os.environ['OPEN_LEDGER_ACCESS_KEY_ID'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey",
                                      os.environ['OPEN_LEDGER_SECRET_ACCESS_KEY'])
    request.addfinalizer(lambda: sc.stop())

    return sc
Example #30
Source File: benchmark_spark.py From implicit with MIT License | 5 votes |
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed / iterations))
    finally:
        spark.stop()

    return times