Python pyspark.sql.functions.desc() Examples

The following are 6 code examples of pyspark.sql.functions.desc(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
Example #1
Source File: medline_spark.py    From pubmed_parser with MIT License 6 votes vote down vote up
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file.

    Parses every ``medline*.xml.gz`` file found in ``download_dir`` and writes
    three parquet outputs under ``save_dir``, each suffixed with the formatted
    update date:

    * ``medline_raw_<date>.parquet``      - every parsed publication row
    * ``medline_lastview_<date>.parquet`` - one row per ``pmid`` (the row from
      the file whose name sorts last), keeping only groups whose maximum
      ``delete`` value is ``False``
    * ``medline_grant_<date>.parquet``    - parsed grant rows

    Relies on names defined elsewhere in the file: ``sc``, ``pp``, ``Row``,
    ``Window``, ``desc``, ``rank``, ``max``, ``save_dir`` and ``download_dir``.

    Args:
        date_update: object providing ``strftime`` (e.g. a date/datetime);
            used only to build the ``%Y_%m_%d`` suffix of the output names.
    """
    # Local import so this block does not depend on the file's import header.
    import shutil

    print("Process MEDLINE file to parquet")
    # Remove previous outputs if they still exist.  The paths returned by the
    # glob are deleted directly: passing 'medline_*.parquet' to `rm` without a
    # shell never expands the wildcard and would not look inside save_dir.
    for stale_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path, ignore_errors=True)
        else:
            try:
                os.remove(stale_path)
            except OSError:
                # Best effort, like `rm -f`: a path that vanished or cannot be
                # removed must not abort the processing run.
                pass

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    # One Row per parsed publication, tagged with the file it came from.
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    # Rank the rows of each pmid by descending file name; pos == 1 is the row
    # from the file whose name sorts last.
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
Example #2
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License 5 votes vote down vote up
def getTopEntities(self, e, targetType = '', maxCount = 20, minScore = 0.0):
    """Return the entities scoring highest against entity ``e``.

    The row for ``e`` is looked up in ``self.df`` and handed to
    ``self.raiseErrorIfNotFound``.  Every other entity (optionally restricted
    to ``targetType``) is scored with ``udfCosineSimilarity`` against that
    row's ``Data`` column.

    Args:
        e: EntityId of the reference entity.
        targetType: when not the empty string, only entities whose
            EntityType equals this value are considered.
        maxCount: maximum number of rows in the result.
        minScore: rows with a Score below this value are dropped.

    Returns:
        DataFrame with columns EntityId, EntityType and Score, ordered by
        descending Score and limited to ``maxCount`` rows.
    """
    entities = self.df
    anchor = entities.where(entities.EntityId == e).first()
    self.raiseErrorIfNotFound(anchor, e)

    # Always exclude the reference entity itself; add the type filter on demand.
    if targetType == '':
        candidates = entities.where(entities.EntityId != e)
    else:
        candidates = entities.where(
            (entities.EntityId != e) & (entities.EntityType == targetType))

    score = udfCosineSimilarity(F.lit(anchor.Data), candidates.Data).alias('Score')
    scored = candidates.select(candidates.EntityId, candidates.EntityType, score)
    above_threshold = scored.where(scored.Score >= minScore)
    ranked = above_threshold.orderBy(scored.Score.desc())
    return ranked.limit(maxCount)

# COMMAND ----------

# MAGIC %md **PaperSimilarity** class to compute paper recommendations

# COMMAND ----------

#   Parameters:
#     resource: resource stream path
#     container: container name in Azure Storage (AS) account
#     account: Azure Storage (AS) account
#     sas: complete 'Blob service SAS URL' of the shared access signature (sas) for the container
#     key: access key for the container; ignored if sas is specified
#
#   Note:
#     resource does not have header
#     you need to provide value for either sas or key
# 
Example #3
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License 5 votes vote down vote up
def _getTopEntitiesByEmbedding(self, e, maxCount, minScore):
    """Return the papers scoring highest against entity ``e`` by embedding.

    Every entity in ``self.df`` other than ``e`` is scored with
    ``udfCosineSimilarityN`` against the ``Data`` of the row for ``e``.  The
    scores are inner-joined to the 'Papers' dataframe on
    EntityId == PaperId.  NaN scores and scores below ``minScore`` are
    dropped.

    Args:
        e: EntityId of the reference entity.
        maxCount: maximum number of rows in the result.
        minScore: rows with a Score below this value are dropped.

    Returns:
        DataFrame with columns PaperId, PaperTitle and Score, ordered by
        descending Score and limited to ``maxCount`` rows.
    """
    entities = self.df
    papers = self.mag.getDataframe('Papers')
    anchor = entities.where(entities.EntityId == e).first()
    others = entities.where(entities.EntityId != e)

    score = udfCosineSimilarityN(F.lit(anchor.Data), others.Data).alias('Score')
    scored = others.select(others.EntityId, score)

    joined = scored.join(papers, scored.EntityId == papers.PaperId, 'inner')
    projected = joined.select(papers.PaperId, papers.PaperTitle, scored.Score)
    usable = projected.where((~F.isnan(scored.Score)) & (scored.Score >= minScore))
    return usable.orderBy(scored.Score.desc()).limit(maxCount)
Example #4
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License 5 votes vote down vote up
def _getTopEntitiesByCocitation(self, e, maxCount, minScore):
    """Return the papers with the highest Score paired with ``e`` in ``self.corefdf``.

    Rows of ``self.corefdf`` whose ReferenceId equals ``e`` are inner-joined
    to the 'Papers' dataframe on CoReferenceId == PaperId.

    Args:
        e: ReferenceId of the reference paper.
        maxCount: maximum number of rows in the result.
        minScore: rows with a Score below this value are dropped.

    Returns:
        DataFrame with columns PaperId, PaperTitle and Score, ordered by
        descending Score and limited to ``maxCount`` rows.
    """
    corefs = self.corefdf
    papers = self.mag.getDataframe('Papers')
    matches = corefs.where(corefs.ReferenceId == e)

    joined = matches.join(papers, matches.CoReferenceId == papers.PaperId, 'inner')
    projected = joined.select(papers.PaperId, papers.PaperTitle, matches.Score)
    above_threshold = projected.where(matches.Score >= minScore)
    return above_threshold.orderBy(matches.Score.desc()).limit(maxCount)
Example #5
Source File: TutorialClasses.py    From KDD2019-HandsOn-Tutorial with MIT License 5 votes vote down vote up
def _compute_cocitations(self, coreferenceLimit=50):
    """Count how often two references appear together in the same paper.

    Self-joins the 'PaperReferences' dataframe on PaperId.  Only pairs with
    PaperReferenceId1 < PaperReferenceId2 are kept, and the occurrences of
    each pair are counted.

    Args:
        coreferenceLimit: maximum number of rows in the result.

    Returns:
        DataFrame with columns ReferenceId, CoReferenceId and Score (the pair
        count), ordered by descending count and limited to
        ``coreferenceLimit`` rows.
    """
    left = self.mag.getDataframe('PaperReferences')
    left = left.selectExpr("PaperId as PaperId1", "PaperReferenceId as PaperReferenceId1")

    right = self.mag.getDataframe('PaperReferences')
    right = right.selectExpr("PaperId as PaperId2", "PaperReferenceId as PaperReferenceId2")

    same_paper = left.join(right, left.PaperId1 == right.PaperId2, 'inner')
    ordered_pairs = same_paper.filter(left.PaperReferenceId1 < right.PaperReferenceId2)
    pairs = ordered_pairs.select(
        left.PaperReferenceId1.alias('ReferenceId'),
        right.PaperReferenceId2.alias('CoReferenceId'),
    )

    counts = pairs.groupBy('ReferenceId', 'CoReferenceId').count()
    ranked = counts.orderBy(F.desc('count'))
    renamed = ranked.selectExpr(
        "ReferenceId as ReferenceId",
        "CoReferenceId as CoReferenceId",
        "count as Score",
    )
    return renamed.limit(coreferenceLimit)


# COMMAND ---------- 
Example #6
Source File: plot.py    From koalas with Apache License 2.0 5 votes vote down vote up
def _get_fliers(colname, outliers):
    """Collect the largest absolute outlier values of ``colname``.

    Args:
        colname: name of the column; the rows are filtered on the
            ``__<colname>_outlier`` column of ``outliers``.
        outliers: dataframe holding both ``colname`` and its
            ``__<colname>_outlier`` column.

    Returns:
        The ``.values`` of the collected pandas column: absolute values of
        ``colname`` for the outlier rows, ordered descending, at most 1001
        entries.
    """
    # Backtick-quote the name when it is used as a column reference.
    quoted = "`{}`".format(colname)

    # Keep only the rows flagged as outliers.
    flagged = outliers.filter("__{}_outlier".format(colname))

    # Absolute values, largest first, capped before collecting to pandas.
    magnitudes = flagged.select(F.abs(F.col(quoted)).alias(colname))
    largest = magnitudes.orderBy(F.desc(quoted)).limit(1001)
    collected = largest.toPandas()

    return collected[colname].values