Python pyspark.sql.SQLContext() Examples
The following are 21 code examples of pyspark.sql.SQLContext(). You can go to the original project or source file by following the link above each example, and you may also want to check out all available functions and classes of the pyspark.sql module.
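Before diving into the examples, here is a minimal sketch of the pattern most of them share: wrap an existing SparkContext in an SQLContext and use it to create and query DataFrames. This sketch is not taken from any of the projects below; the app name and data are made up, and in Spark 2.x and later SparkSession is the preferred entry point, with SQLContext kept mainly for backward compatibility.

# Minimal standalone sketch (assumed local setup, hypothetical app name and data)
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext("local[2]", "sqlcontext-sketch")
sqlc = SQLContext(sc)

# Build a small DataFrame and query it through the SQLContext
df = sqlc.createDataFrame([Row(name="Alice", age=2), Row(name="Bob", age=5)])
df.createOrReplaceTempView("people")  # Spark 2.0+; older code uses registerTempTable
sqlc.sql("SELECT name FROM people WHERE age > 3").show()

sc.stop()

Several of the examples below follow the same shape, differing mainly in how the SparkContext is configured and where the data comes from.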
Example #1
Source File: sparkcc.py From cc-pyspark with MIT License | 6 votes |
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()
    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example #2
Source File: test_ExtractCCLinks.py From cccatalog with MIT License | 6 votes |
def summarizeOutput(self):
    s = SQLContext(self.sc)
    res = s.read.parquet(self.cclinks.output)

    totalLinks = res.count()
    uniqueContentQuery = res.drop_duplicates(subset=['provider_domain', 'content_path', 'content_query_string']).count()
    uniqueContent = res.drop_duplicates(subset=['provider_domain', 'content_path']).count()

    res.registerTempTable('test_deeds')
    summary = s.sql('SELECT provider_domain, count(*) AS total, count(distinct content_path) AS unique_content_path, count(distinct content_query_string) AS unique_query_string FROM test_deeds GROUP BY provider_domain ORDER BY total DESC LIMIT 100')
    summary.write.mode('overwrite').format('csv').option('header', 'true').save(self.cclinks.output.replace('parquet', 'summary'))

    fh = open('{}/total'.format(self.cclinks.output.replace('parquet', 'summary')), 'w')
    fh.write('Total records: {}\r\n'.format(totalLinks))
    fh.write('Total unique content path: {}\r\n'.format(uniqueContent))
    fh.write('Total unique query strings: {}\r\n'.format(uniqueContentQuery))
    fh.close()
Example #3
Source File: drybell_spark.py From snorkel-tutorials with Apache License 2.0 | 6 votes |
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #4
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0 | 6 votes |
def init_spark_session(app_name):
    """ Initializes a Spark Session with the given application name.

        Args:
            app_name (str): Name of the Spark application. This will also occur in the Spark UI.
    """
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .config("spark.driver.maxResultSize", "4g") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #5
Source File: test_spark.py From snorkel with Apache License 2.0 | 6 votes |
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp_memoized])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example #6
Source File: test_spark.py From snorkel with Apache License 2.0 | 5 votes |
def test_lf_applier_spark_preprocessor(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, fp])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example #7
Source File: pyspark_task.py From luigi-sample with MIT License | 5 votes |
def main(self, sc, *args):
    logging.info("=======SPARK JOB=========")
    sqlContext = SQLContext(sc)
    df = (sqlContext.load(source="jdbc",
                          url="jdbc:postgresql://localhost:2222/mydatabase?user=dbuser&password=dbpassword",
                          dbtable="tablename"))
    print(df.show())
    logging.info(df.printSchema())
    with self.output().open('w') as outFile:
        # NOTE: 'result' is not defined in this snippet; it comes from elsewhere in the original task
        outFile.write(str(result))
Example #8
Source File: holoclean.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def _init_spark(self):
    """
    Set spark configuration

    :return: Spark session
    :return: Spark SQL context
    """
    conf = SparkConf()

    # Link PG driver to Spark
    conf.set("spark.executor.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set("spark.driver.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set('spark.driver.memory', '20g')
    conf.set('spark.executor.memory', '20g')
    conf.set("spark.network.timeout", "6000")
    conf.set("spark.rpc.askTimeout", "99999")
    conf.set("spark.worker.timeout", "60000")
    conf.set("spark.driver.maxResultSize", '70g')
    conf.set("spark.ui.showConsoleProgress", "false")

    if self.spark_cluster:
        conf.set("spark.master", self.spark_cluster)

    # Gets Spark context
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    sql_ctxt = SQLContext(sc)
    return sql_ctxt.sparkSession, sql_ctxt
Example #9
Source File: pcontext.py From sparklingpandas with Apache License 2.0 | 5 votes |
def __init__(self, spark_context, sql_ctx=None):
    """Initialize a PSparkContext with the associated Spark context,
    and Spark SQL context if provided. This context is used to load
    data into L{DataFrame}s.

    Parameters
    ----------
    spark_context: SparkContext
        Initialized and configured Spark context. If you are running in the
        PySpark shell, this is already created as "sc".
    sql_ctx: SQLContext, optional
        Initialized and configured SQL context; if not provided, Sparkling
        Pandas will create one.

    Returns
    -------
    Correctly initialized SparklingPandasContext.
    """
    self.spark_ctx = spark_context
    if sql_ctx:
        self.sql_ctx = sql_ctx
    else:
        logging.info("No sql context provided, creating")
        from pyspark.sql import SQLContext
        self.sql_ctx = SQLContext(self.spark_ctx)
    # Register our magical functions
    register_sql_extensions(self.sql_ctx)
Example #10
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def init_test_session(app_name):
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .master('local') \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #11
Source File: dataframe.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext, SparkSession
    import pyspark.sql.dataframe
    from pyspark.sql.functions import from_unixtime
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['spark'] = SparkSession(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80),
                                   Row(name='Bob', height=85)]).toDF()
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2),
                                   Row(name='Bob', age=5)]).toDF()
    globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
                                   Row(name='Bob', age=5, height=None),
                                   Row(name='Tom', age=None, height=None),
                                   Row(name=None, age=None, height=None)]).toDF()
    globs['df5'] = sc.parallelize([Row(name='Alice', spy=False, age=10),
                                   Row(name='Bob', spy=None, age=5),
                                   Row(name='Mallory', spy=True, age=None)]).toDF()
    globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846),
                                   Row(name='Bob', time=1479442946)]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #12
Source File: recommendation.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    import pyspark.mllib.recommendation
    from pyspark.sql import SQLContext
    globs = pyspark.mllib.recommendation.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #13
Source File: spark_ml_pipline.py From Hanhan-Spark-Python with MIT License | 5 votes |
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example #14
Source File: tests.py From spark-deep-learning with Apache License 2.0 | 5 votes |
def setup_env(cls):
    cls.sc = SparkContext('local[*]', cls.__name__)
    cls.sql = SQLContext(cls.sc)
    cls.session = SparkSession.builder.getOrCreate()
Example #15
Source File: test_spark.py From snorkel with Apache License 2.0 | 5 votes |
def test_lf_applier_spark_fault(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, f_bad])
    with self.assertRaises(Exception):
        applier.apply(rdd)
    L = applier.apply(rdd, fault_tolerant=True)
    np.testing.assert_equal(L, L_EXPECTED_BAD)
Example #16
Source File: test_spark.py From snorkel with Apache License 2.0 | 5 votes |
def test_lf_applier_spark(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, g])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_EXPECTED)
Example #17
Source File: conftest.py From elephas with MIT License | 5 votes |
def sql_context(request):
    """ fixture for creating a Spark SQLContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())
    quiet_py4j()
    return sql_context
Example #18
Source File: adapter.py From elephas with MIT License | 5 votes |
def to_data_frame(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into Spark DataFrame
    """
    lp_rdd = to_labeled_point(sc, features, labels, categorical)
    sql_context = SQLContext(sc)
    df = sql_context.createDataFrame(lp_rdd)
    return df
Example #19
Source File: __init__.py From pyspark2pmml with GNU Affero General Public License v3.0 | 5 votes |
def setUp(self):
    self.sc = SparkContext()
    self.sqlContext = SQLContext(self.sc)
Example #20
Source File: test_deeds.py From cccatalog with MIT License | 4 votes |
def sql_context(request, spark_context):
    return SQLContext(spark_context)
Example #21
Source File: streaming_context.py From monasca-analytics with Apache License 2.0 | 4 votes |
def get_sqlcontext_instance(spark_context):
    """
    :type spark_context: pyspark.SparkContext
    :param spark_context: The currently active Spark Context
    :return: Returns the SQLContext
    :rtype: sql.SQLContext
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = sql.SQLContext(
            spark_context)
    return globals()['sqlContextSingletonInstance']