Python pyspark.sql.SQLContext() Examples
The following are 21 code examples of pyspark.sql.SQLContext(). You can go to the original project or source file by following the link above each example, and you may also want to check out all available functions and classes of the pyspark.sql module.
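Before diving into the examples, here is a minimal sketch of the pattern most of them share: wrap an existing SparkContext in an SQLContext and use it to create and query DataFrames. This sketch is not taken from any of the projects below; the app name and data are made up, and in Spark 2.x and later SparkSession is the preferred entry point, with SQLContext kept mainly for backward compatibility.

# Minimal standalone sketch (assumed local setup, hypothetical app name and data)
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext("local[2]", "sqlcontext-sketch")
sqlc = SQLContext(sc)

# Build a small DataFrame and query it through the SQLContext
df = sqlc.createDataFrame([Row(name="Alice", age=2), Row(name="Bob", age=5)])
df.createOrReplaceTempView("people")  # Spark 2.0+; older code uses registerTempTable
sqlc.sql("SELECT name FROM people WHERE age > 3").show()

sc.stop()

Several of the examples below follow the same shape, differing mainly in how the SparkContext is configured and where the data comes from.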
Example #1
Source File: sparkcc.py From cc-pyspark with MIT License | 6 votes |
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()
    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example #2
Source File: test_ExtractCCLinks.py From cccatalog with MIT License | 6 votes |
def summarizeOutput(self):
    s = SQLContext(self.sc)
    res = s.read.parquet(self.cclinks.output)

    totalLinks = res.count()
    uniqueContentQuery = res.drop_duplicates(subset=['provider_domain', 'content_path', 'content_query_string']).count()
    uniqueContent = res.drop_duplicates(subset=['provider_domain', 'content_path']).count()

    res.registerTempTable('test_deeds')
    summary = s.sql('SELECT provider_domain, count(*) AS total, count(distinct content_path) AS unique_content_path, count(distinct content_query_string) AS unique_query_string FROM test_deeds GROUP BY provider_domain ORDER BY total DESC LIMIT 100')
    summary.write.mode('overwrite').format('csv').option('header', 'true').save(self.cclinks.output.replace('parquet', 'summary'))

    fh = open('{}/total'.format(self.cclinks.output.replace('parquet', 'summary')), 'w')
    fh.write('Total records: {}\r\n'.format(totalLinks))
    fh.write('Total unique content path: {}\r\n'.format(uniqueContent))
    fh.write('Total unique query strings: {}\r\n'.format(uniqueContentQuery))
    fh.close()
Example #3
Source File: drybell_spark.py From snorkel-tutorials with Apache License 2.0 | 6 votes |
def main(data_path, output_path):
    # Read data
    logging.info(f"Reading data from {data_path}")
    sc = SparkContext()
    sql = SQLContext(sc)
    data = sql.read.parquet(data_path)

    # Build label matrix
    logging.info("Applying LFs")
    lfs = [article_mentions_person, body_contains_fortune, person_in_db]
    applier = SparkLFApplier(lfs)
    L = applier.apply(data.rdd)

    # Train label model
    logging.info("Training label model")
    label_model = LabelModel(cardinality=2)
    label_model.fit(L)

    # Generate training labels
    logging.info("Generating probabilistic labels")
    y_prob = label_model.predict_proba(L)[:, 1]
    y_prob_sql_array = F.array([F.lit(y) for y in y_prob])
    data_labeled = data.withColumn("y_prob", y_prob_sql_array)
    data_labeled.write.mode("overwrite").parquet(output_path)
    logging.info(f"Labels saved to {output_path}")
Example #4
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0 | 6 votes |
def init_spark_session(app_name):
    """ Initializes a Spark Session with the given application name.

        Args:
            app_name (str): Name of the Spark application. This will also occur in the Spark UI.
    """
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .config("spark.driver.maxResultSize", "4g") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #5
Source File: test_spark.py From snorkel with Apache License 2.0 | 6 votes |
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp_memoized])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example #6
Source File: test_spark.py From snorkel with Apache License 2.0 | 5 votes |
def test_lf_applier_spark_preprocessor(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, fp])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example #7
Source File: pyspark_task.py From luigi-sample with MIT License | 5 votes |
def main(self, sc, *args):
    logging.info("=======SPARK JOB=========")
    sqlContext = SQLContext(sc)
    df = (sqlContext.load(source="jdbc",
                          url="jdbc:postgresql://localhost:2222/mydatabase?user=dbuser&password=dbpassword",
                          dbtable="tablename"))
    print(df.show())
    logging.info(df.printSchema())
    with self.output().open('w') as outFile:
        # NOTE: 'result' is not defined in this snippet; it comes from elsewhere in the original task
        outFile.write(str(result))
Example #8
Source File: holoclean.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def _init_spark(self):
    """
    Set spark configuration

    :return: Spark session
    :return: Spark SQL context
    """
    conf = SparkConf()

    # Link PG driver to Spark
    conf.set("spark.executor.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set("spark.driver.extraClassPath",
             self.holoclean_path + "/" + self.pg_driver)
    conf.set('spark.driver.memory', '20g')
    conf.set('spark.executor.memory', '20g')
    conf.set("spark.network.timeout", "6000")
    conf.set("spark.rpc.askTimeout", "99999")
    conf.set("spark.worker.timeout", "60000")
    conf.set("spark.driver.maxResultSize", '70g')
    conf.set("spark.ui.showConsoleProgress", "false")

    if self.spark_cluster:
        conf.set("spark.master", self.spark_cluster)

    # Gets Spark context
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    sql_ctxt = SQLContext(sc)
    return sql_ctxt.sparkSession, sql_ctxt
Example #9
Source File: pcontext.py From sparklingpandas with Apache License 2.0 | 5 votes |
def __init__(self, spark_context, sql_ctx=None):
    """Initialize a PSparkContext with the associated Spark context,
    and Spark SQL context if provided. This context is used to load
    data into L{DataFrame}s.

    Parameters
    ----------
    spark_context: SparkContext
        Initialized and configured Spark context. If you are running in the
        PySpark shell, this is already created as "sc".
    sql_ctx: SQLContext, optional
        Initialized and configured SQL context; if not provided, Sparkling
        Pandas will create one.

    Returns
    -------
    Correctly initialized SparklingPandasContext.
    """
    self.spark_ctx = spark_context
    if sql_ctx:
        self.sql_ctx = sql_ctx
    else:
        logging.info("No sql context provided, creating")
        from pyspark.sql import SQLContext
        self.sql_ctx = SQLContext(self.spark_ctx)
    # Register our magical functions
    register_sql_extensions(self.sql_ctx)
Example #10
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def init_test_session(app_name):
    global session, context, sql_context
    try:
        session = SparkSession \
            .builder \
            .master('local') \
            .appName(app_name) \
            .config("spark.hadoop.dfs.client.use.datanode.hostname", "true") \
            .config("spark.hadoop.dfs.datanode.use.datanode.hostname", "true") \
            .getOrCreate()
        context = session.sparkContext
        context.setLogLevel("ERROR")
        sql_context = SQLContext(context)
    except Py4JJavaError as err:
        raise SparkSessionNotInitializedException(app_name, err.java_exception)
Example #11
Source File: dataframe.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext, SparkSession
    import pyspark.sql.dataframe
    from pyspark.sql.functions import from_unixtime
    globs = pyspark.sql.dataframe.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['spark'] = SparkSession(sc)
    globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\
        .toDF(StructType([StructField('age', IntegerType()),
                          StructField('name', StringType())]))
    globs['df2'] = sc.parallelize([Row(name='Tom', height=80),
                                   Row(name='Bob', height=85)]).toDF()
    globs['df3'] = sc.parallelize([Row(name='Alice', age=2),
                                   Row(name='Bob', age=5)]).toDF()
    globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80),
                                   Row(name='Bob', age=5, height=None),
                                   Row(name='Tom', age=None, height=None),
                                   Row(name=None, age=None, height=None)]).toDF()
    globs['df5'] = sc.parallelize([Row(name='Alice', spy=False, age=10),
                                   Row(name='Bob', spy=None, age=5),
                                   Row(name='Mallory', spy=True, age=None)]).toDF()
    globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846),
                                   Row(name='Bob', time=1479442946)]).toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.dataframe, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #12
Source File: recommendation.py From LearningApacheSpark with MIT License | 5 votes |
def _test():
    import doctest
    import pyspark.mllib.recommendation
    from pyspark.sql import SQLContext
    globs = pyspark.mllib.recommendation.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #13
Source File: spark_ml_pipline.py From Hanhan-Spark-Python with MIT License | 5 votes |
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)

    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
Example #14
Source File: tests.py From spark-deep-learning with Apache License 2.0 | 5 votes |
def setup_env(cls):
    cls.sc = SparkContext('local[*]', cls.__name__)
    cls.sql = SQLContext(cls.sc)
    cls.session = SparkSession.builder.getOrCreate()
Example #15
Source File: test_spark.py From snorkel with Apache License 2.0 | 5 votes |
def test_lf_applier_spark_fault(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, f_bad])
    with self.assertRaises(Exception):
        applier.apply(rdd)
    L = applier.apply(rdd, fault_tolerant=True)
    np.testing.assert_equal(L, L_EXPECTED_BAD)
Example #16
Source File: test_spark.py From snorkel with Apache License 2.0 | 5 votes |
def test_lf_applier_spark(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, g])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_EXPECTED)
Example #17
Source File: conftest.py From elephas with MIT License | 5 votes |
def sql_context(request):
    """ fixture for creating a Spark SQLContext
    Args:
        request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local[2]").setAppName(
        "pytest-pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)
    request.addfinalizer(lambda: sc.stop())
    quiet_py4j()
    return sql_context
Example #18
Source File: adapter.py From elephas with MIT License | 5 votes |
def to_data_frame(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into Spark DataFrame
    """
    lp_rdd = to_labeled_point(sc, features, labels, categorical)
    sql_context = SQLContext(sc)
    df = sql_context.createDataFrame(lp_rdd)
    return df
Example #19
Source File: __init__.py From pyspark2pmml with GNU Affero General Public License v3.0 | 5 votes |
def setUp(self):
    self.sc = SparkContext()
    self.sqlContext = SQLContext(self.sc)
Example #20
Source File: test_deeds.py From cccatalog with MIT License | 4 votes |
def sql_context(request, spark_context):
    return SQLContext(spark_context)
Example #21
Source File: streaming_context.py From monasca-analytics with Apache License 2.0 | 4 votes |
def get_sqlcontext_instance(spark_context):
    """
    :type spark_context: pyspark.SparkContext
    :param spark_context: The currently active Spark Context
    :return: Returns the SQLContext
    :rtype: sql.SQLContext
    """
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = sql.SQLContext(
            spark_context)
    return globals()['sqlContextSingletonInstance']