Python pandas.notnull() Examples

The following are 30 code examples of pandas.notnull(), drawn from open-source projects. You can go to the original project or source file by following the reference above each example. You may also want to check out all the available functions and classes of the pandas module.
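Before the project examples, here is a minimal self-contained sketch (with made-up data) of what pandas.notnull() returns for a scalar, a Series, and a DataFrame:

import numpy as np
import pandas as pd

# Scalar input: returns a single bool
print(pd.notnull(3.14))    # True
print(pd.notnull(np.nan))  # False
print(pd.notnull(None))    # False

# Series input: returns a boolean Series of the same length
s = pd.Series([1.0, np.nan, 3.0])
print(pd.notnull(s).tolist())  # [True, False, True]

# DataFrame input: returns a boolean DataFrame, handy for masking and filtering
df = pd.DataFrame({"a": [1.0, np.nan], "b": [None, "x"]})
print(pd.notnull(df))
print(df[pd.notnull(df["a"])])  # keep only the rows where column "a" is not null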
Example #1
Source File: quality.py    From ssbio with MIT License
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns = {1:'psqs_local', 2:'psqs_burial', 3:'psqs_contact', 4:'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results 
Example #2
Source File: cbc_hb.py    From lifestyles with MIT License
def _create_observation_variable(individual_selections, choices, partsworth):
    """
    This function handles creating the PyMC3 observation variables.  It also gracefully handles missing observations in individual selections.

    `individual_selections` is a Series of the selections an individual made, starting from 0. It can contain NaNs, which indicate that no answer was provided.

    `choices` is a DataFrame with a hierarchical index: level=0 enumerates the choices, and level=1 displays the profile at a specific choice.
    Its size is (n_questions, n_choices_per_question).

    `partsworth` is a slice of a PyMC3 matrix. It represents the partsworth variables of an individual. Its size is (n_profiles,).

    This computes the values exp(partsworth * profile_j) / sum_k[ exp(partsworth * profile_k) ] for all j.
    """
    nan_mask = pd.notnull(individual_selections)
    return pm.Categorical("Obs_%s" % individual_selections.name,
                          tt.nnet.softmax(tt.stack([
                            tt.dot(choice.values, partsworth) for _, choice in choices[nan_mask.values].groupby(axis=1, level=0)
                          ], axis=0).T),
                          observed=individual_selections[nan_mask.values].values) 
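The pd.notnull mask above is used twice: once to drop unanswered questions from the choices frame and once to drop them from the observed selections. A stripped-down sketch of just that masking step, using a made-up selections Series (no PyMC3 required):

import numpy as np
import pandas as pd

individual_selections = pd.Series([0.0, np.nan, 2.0, 1.0], name="respondent_7")

nan_mask = pd.notnull(individual_selections)            # True where an answer exists
print(nan_mask.values)                                  # [ True False  True  True]
print(individual_selections[nan_mask.values].values)    # [0. 2. 1.]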
Example #3
Source File: utils.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License
def fillna(series_or_arr, missing_value=0.0):
    """Fill missing values in pandas objects and numpy arrays.

    Arguments
    ---------
    series_or_arr : pandas.Series, numpy.ndarray
        The numpy array or pandas series for which the missing values
        need to be replaced.
    missing_value : float, int, str
        The value to replace the missing value with. Default 0.0.

    Returns
    -------
    pandas.Series, numpy.ndarray
        The numpy array or pandas series with the missing values
        filled.
    """

    if pandas.notnull(missing_value):
        if isinstance(series_or_arr, (numpy.ndarray)):
            series_or_arr[numpy.isnan(series_or_arr)] = missing_value
        else:
            series_or_arr.fillna(missing_value, inplace=True)

    return series_or_arr 
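As a usage sketch (assuming the fillna helper above and its pandas/numpy imports are in scope), note that the pd.notnull(missing_value) guard simply skips the fill when the caller passes NaN as the replacement value:

import numpy
import pandas

arr = numpy.array([1.0, numpy.nan, 3.0])
print(fillna(arr.copy(), 0.0))            # [1. 0. 3.]
print(fillna(arr.copy(), numpy.nan))      # [ 1. nan  3.] -- unchanged: pandas.notnull(nan) is False

s = pandas.Series([numpy.nan, "x", None])
print(fillna(s, "missing").tolist())      # ['missing', 'x', 'missing']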
Example #4
Source File: datasets.py    From deepchem with MIT License
def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk") 
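The df.where(pd.notnull(df), None) idiom used above, and repeated in several of the examples below, replaces every NaN cell with a Python None (the affected columns become object dtype), which is what JSON/Avro writers and many database drivers expect. A minimal sketch of the pattern with made-up data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"task": ["a", None, "c"], "score": [1.5, np.nan, 3.0]})

# where() keeps each value when the condition is True and substitutes None elsewhere
cleaned = df.where(pd.notnull(df), None)
print(cleaned.to_dict(orient="records"))
# [{'task': 'a', 'score': 1.5}, {'task': None, 'score': None}, {'task': 'c', 'score': 3.0}]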
Example #5
Source File: plotter.py    From pygraphistry with BSD 3-Clause "New" or "Revised" License
def _make_json_dataset(self, edges, nodes, name):
        (elist, nlist) = self._bind_attributes_v1(edges, nodes)
        edict = elist.where((pandas.notnull(elist)), None).to_dict(orient='records')

        bindings = {'idField': self._node or Plotter._defaultNodeId,
                    'destinationField': self._destination, 'sourceField': self._source}
        dataset = {'name': PyGraphistry._config['dataset_prefix'] + name,
                   'bindings': bindings, 'type': 'edgelist', 'graph': edict}

        if nlist is not None:
            ndict = nlist.where((pandas.notnull(nlist)), None).to_dict(orient='records')
            dataset['labels'] = ndict
        return dataset


    # Main helper for creating ETL2 payload 
Example #6
Source File: vgraph.py    From pygraphistry with BSD 3-Clause "New" or "Revised" License
def objectEncoder(vg, series, dtype):
    series.where(pandas.notnull(series), '\0', inplace=True)
    # vec is a string[] submessage within a repeated field
    vec = vg.string_vectors.add()
    str_series = None    
    try:
        str_series = series.astype('unicode')
    except UnicodeDecodeError:
        warnings.warn("Warning: escaping unicode")
        str_series = series.apply(lambda v: v.decode('utf-8'))
    for val in str_series:
        vec.values.append(val)
    return (vec, {'ctype': 'utf8'})


# NaN (as well as Infinity and undefined) are not valid JSON. Use this guard to filter
# them out when creating the json metadata.
Example #7
Source File: datasets.py    From PADME with MIT License
def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk") 
Example #8
Source File: utils.py    From urbansprawl with MIT License
def load_geodataframe(geo_filename):
	""" 
	Load input GeoDataFrame

	Parameters
	----------
	geo_filename : string
		input GeoDataFrame filename

	Returns
	----------
	geopandas.GeoDataFrame
		loaded data

	"""
	# Load using geopandas
	df_osm_data = gpd.read_file(geo_filename)
	# Set None as NaN
	df_osm_data.fillna(value=np.nan, inplace=True)
	# Replace empty string (Json NULL sometimes read as '') for NaN
	df_osm_data.replace('', np.nan, inplace=True)
	
	def list_int_from_string(x): # List of integers given input in string format
		return [ int(id_) for id_ in x.split(",") ]
	def list_str_from_string(x): # List of strings given input in string format
		return x.split(",")

	# Recover list
	if ( "activity_category" in df_osm_data.columns): 
		df_osm_data[ "activity_category" ] = df_osm_data.activity_category.apply(lambda x: list_str_from_string(x) if pd.notnull(x) else np.nan )
	if ( "containing_parts" in df_osm_data.columns): 
		df_osm_data[ "containing_parts" ] = df_osm_data.containing_parts.apply( lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan )
	if ( "containing_poi" in df_osm_data.columns): 
		df_osm_data[ "containing_poi" ] = df_osm_data.containing_poi.apply( lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan )
	
	# To UTM coordinates
	return ox.project_gdf( df_osm_data ) 
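The per-cell pd.notnull(x) guard in the apply calls above is the usual way to run a parser only on populated cells while leaving NaNs untouched. A tiny standalone sketch of the same idea, using the containing_parts column from the example with made-up values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"containing_parts": ["1,2,3", np.nan, "7"]})

# Parse the comma-separated string only where the cell is not null; NaNs pass through
df["containing_parts"] = df["containing_parts"].apply(
    lambda x: [int(i) for i in x.split(",")] if pd.notnull(x) else np.nan
)
print(df["containing_parts"].tolist())  # [[1, 2, 3], nan, [7]]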
Example #9
Source File: generate_avro_file.py    From tfx with Apache License 2.0
def generate_avro(src_file: Text, output_file: Text):
  """Generates avro file based on src file.

  Args:
    src_file: path to Chicago taxi dataset.
    output_file: output path for avro file.
  """
  df = pd.read_csv(src_file)
  # Replace NaNs with None so the Avro writer interprets them as null values
  df = df.where((pd.notnull(df)), None)

  records = df.to_dict(orient='records')

  parsed_schema = fastavro.parse_schema(get_schema())
  with open(output_file, 'wb') as f:
    fastavro.writer(f, parsed_schema, records) 
Example #10
Source File: test_mice.py    From vnpy_crypto with MIT License
def test_pertmeth(self):
        # Test with specified perturbation method.

        df = gendat()
        orig = df.copy()
        mx = pd.notnull(df)
        nrow, ncol = df.shape

        for pert_meth in "gaussian", "boot":

            imp_data = mice.MICEData(df, perturbation_method=pert_meth)

            for k in range(2):
                imp_data.update_all()
                assert_equal(imp_data.data.shape[0], nrow)
                assert_equal(imp_data.data.shape[1], ncol)
                assert_allclose(orig[mx], imp_data.data[mx])

        assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) 
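Here pd.notnull(df) is stored as a boolean DataFrame (mx) and later used to index both the original and the imputed frames, so the assertion compares only the cells that were observed in the first place. A small sketch of that masking idea with made-up frames (fillna stands in for the MICE imputation step):

import numpy as np
import pandas as pd

orig = pd.DataFrame({"x": [1.0, np.nan, 3.0], "y": [np.nan, 5.0, 6.0]})
mx = pd.notnull(orig)               # True where a value was observed

imputed = orig.fillna(orig.mean())  # stand-in for an imputation step

# Boolean-DataFrame indexing leaves unselected cells as NaN, so both frames
# agree exactly on the originally observed cells
print(orig[mx].equals(imputed[mx]))  # True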
Example #11
Source File: Trajectory.py    From TrajLib with Apache License 2.0
def pre_processing(self, labels):
        # removing NaN in lat and lon
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        for label in labels:
            self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]
        """
        lat_= self.raw_data.lat.rolling(3, min_periods=1).median()
        self.raw_data.assign(lat=lat_)
        lon_ = self.raw_data.lon.rolling(3, min_periods=1).median()
        self.raw_data.assign(lot=lon_)

        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        """

        return None 
Example #12
Source File: dataframe_utils.py    From fileflow with Apache License 2.0
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. if None, string of data
        will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False, na_rep=None,
                         skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result 
Example #13
Source File: test_logic.py    From ontask_b with MIT License
def test_df_equivalent_after_sql(self):

        # Parse the CSV
        df_source = services.load_df_from_csvfile(
            io.StringIO(self.csv1),
            0,
            0)

        # Store the DF in the DB
        pandas.store_table(df_source, self.table_name)

        # Load it from the DB
        df_dst = pandas.load_table(self.table_name)

        # NaN in boolean columns are now None
        df_source['bool1'] = df_source['bool1'].where(
            pd.notnull(df_source['bool1']),
            None)
        df_source['bool2'] = df_source['bool2'].where(
            pd.notnull(df_source['bool2']),
            None)

        # Data frames must be identical
        assert df_source.equals(df_dst) 
Example #14
Source File: movie_data.py    From parade with MIT License
def execute_internal(self, context, **kwargs):
        """
        the internal execution process to be implemented
        :param context:
        :param kwargs:
        :return:
        """
        df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

        # Process projection on the dataset to get our interested attributes
        df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

        # Filter out records with a NaN title_year or a non-positive budget
        df = df[pd.notnull(df['title_year'])]
        df = df[df['budget'] > 0]

        # Extract the genres ROOT
        df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

        return df 
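Filtering rows with df[pd.notnull(df['title_year'])], as above, keeps only records where that column is populated (equivalent to dropna on that single column). A short sketch with invented data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"movie_title": ["A", "B", "C"],
                   "title_year": [1999.0, np.nan, 2005.0],
                   "budget": [0, 5000000, 12000000]})

df = df[pd.notnull(df["title_year"])]  # drop the row with a missing title_year
df = df[df["budget"] > 0]              # then drop zero-budget records
print(df)                              # only the 2005 row survives both filters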
Example #15
Source File: uniprot.py    From ssbio with MIT License
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """

    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id] 
Example #16
Source File: finta.py    From finta with GNU Lesser General Public License v3.0
def KAMA(
        cls,
        ohlc: DataFrame,
        er: int = 10,
        ema_fast: int = 2,
        ema_slow: int = 30,
        period: int = 20,
    ) -> Series:
        """Developed by Perry Kaufman, Kaufman's Adaptive Moving Average (KAMA) is a moving average designed to account for market noise or volatility.
        Its main advantage is that it takes into consideration not just the direction, but the market volatility as well."""

        er = cls.ER(ohlc, er)
        fast_alpha = 2 / (ema_fast + 1)
        slow_alpha = 2 / (ema_slow + 1)
        sc = pd.Series(
            (er * (fast_alpha - slow_alpha) + slow_alpha) ** 2,
            name="smoothing_constant",
        )  ## smoothing constant

        sma = pd.Series(
            ohlc["close"].rolling(period).mean(), name="SMA"
        )  ## first KAMA is SMA
        kama = []
        # Current KAMA = Prior KAMA + smoothing_constant * (Price - Prior KAMA)
        for s, ma, price in zip(
            sc.iteritems(), sma.shift().iteritems(), ohlc["close"].iteritems()
        ):
            try:
                kama.append(kama[-1] + s[1] * (price[1] - kama[-1]))
            except (IndexError, TypeError):
                if pd.notnull(ma[1]):
                    kama.append(ma[1] + s[1] * (price[1] - ma[1]))
                else:
                    kama.append(None)

        sma["KAMA"] = pd.Series(
            kama, index=sma.index, name="{0} period KAMA.".format(period)
        )  ## apply the kama list to existing index
        return sma["KAMA"] 
Example #17
Source File: dataframe_utils.py    From fileflow with Apache License 2.0
def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Reads a utf-8 encoded CSV directly into a pandas dataframe as string values and scrubs np.NaN values to Python None

    :param str filename_or_stream: path to CSV
    :return:
    """
    # pulls data in as utf8, all as strings, and without pre whitespace padding
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True
        )
    except AttributeError:
        # this is an empty dataframe and pandas crashed because it can't coerce the columns to strings
        # issue and PR to fix is open on pandas core at https://github.com/pydata/pandas/issues/12048
        # slated for 1.8 release
        # so for now just try loading the dataframe without specifying dtype
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True
        )
    logging.info('File read via the pandas read_csv methodology.')

    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where((pd.notnull(data)), None)

    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None
    logging.info("Dataframe of shape %s has been retrieved." % str(data.shape))

    return data 
Example #18
Source File: Trajectory.py    From TrajLib with Apache License 2.0
def load_data(self, **kwargs):
        # lat='lat',lon='lon',alt='alt',timeDate='timeDate',labels=['label1'],src='~/gps_fe/bigdata2_8696/ex_traj/5428_walk_790.csv',seperator=','
        print('loading...')
        lat = kwargs.get('lat', "lat")
        print(lat)
        lon = kwargs.get('lon', "lon")
        print(lon)
        alt = kwargs.get('alt', None)
        print(alt)
        time_date = kwargs.get('timeDate', "timeDate")

        print(time_date)
        labels = kwargs.get('labels', "[label]")
        print(labels)
        src = kwargs.get('src', "~/gps_fe/bigdata2_8696/ex_traj/5428_walk_790.csv")
        print(src)
        separator = kwargs.get('separator', ",")
        print(separator)

        self.labels = labels
        # input data needs lat,lon,alt,timeDate, [Labels]
        self.raw_data = pd.read_csv(src, sep=separator, parse_dates=[time_date], index_col=time_date)
        self.raw_data.rename(columns={lat: 'lat'}, inplace=True)
        self.raw_data.rename(columns={lon: 'lon'}, inplace=True)
        if alt is not None:
            self.raw_data.rename(columns={alt: 'alt'}, inplace=True)
        self.raw_data.rename(columns={time_date: 'timeDate'}, inplace=True)
        # preprocessing
        # removing NaN in lat and lon

        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
        for label in labels:
            self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]

        print('Data loaded.')
        return self.raw_data 
Example #19
Source File: crypto_data.py    From investpy with MIT License
def cryptos_as_list():
    """
    This function retrieves all the crypto coin names stored in the `cryptos.csv` file, which contains all the
    data on the crypto coins previously retrieved from Investing.com. So this function will just return
    the crypto coin names, which are the main input parameters of the crypto data retrieval functions
    in investpy.

    Note that only some cryptos are available for retrieval, since Investing.com does not provide information
    on all of them, just the main ones.

    Returns:
        :obj:`list` - cryptos_list:
            The resulting :obj:`list` contains all the available crypto coin names as indexed on Investing.com,
            from the information previously retrieved by investpy and stored in a csv file.

            In case the information was successfully retrieved, the :obj:`list` of crypto coin names will look like::

                cryptos_list = ['Bitcoin', 'Ethereum', 'XRP', 'Bitcoin Cash', 'Tether', 'Litecoin', ...]

    Raises:
        FileNotFoundError: raised if `cryptos.csv` file was not found.
        IOError: raised when `cryptos.csv` file is missing or empty.
    
    """

    resource_package = 'investpy'
    resource_path = '/'.join(('resources', 'crypto', 'cryptos.csv'))
    if pkg_resources.resource_exists(resource_package, resource_path):
        cryptos = pd.read_csv(pkg_resources.resource_filename(resource_package, resource_path))
    else:
        raise FileNotFoundError("ERR#0081: cryptos file not found or errored.")

    if cryptos is None:
        raise IOError("ERR#0082: cryptos not found or unable to retrieve.")

    cryptos = cryptos[cryptos['status'] == 'available']
    cryptos.drop(columns=['tag', 'id', 'status'], inplace=True)
    cryptos = cryptos.where(pd.notnull(cryptos), None)

    return cryptos['name'].tolist() 
Example #20
Source File: utils.py    From reportgen with MIT License
def categorical_order(values, order=None):
    """Return a list of unique data values.

    Determine an ordered list of levels in ``values``.

    Parameters
    ----------
    values : list, array, Categorical, or Series
        Vector of "categorical" values
    order : list-like, optional
        Desired order of category levels to override the order determined
        from the ``values`` object.

    Returns
    -------
    order : list
        Ordered list of category levels not including null values.

    """
    if order is None:
        if hasattr(values, "categories"):
            order = values.categories
        else:
            try:
                order = values.cat.categories
            except (TypeError, AttributeError):
                try:
                    order = values.unique()
                except AttributeError:
                    order = pd.unique(values)
                try:
                    np.asarray(values).astype(np.float)
                    order = np.sort(order)
                except (ValueError, TypeError):
                    order = order
        order = filter(pd.notnull, order)
    return list(order) 
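Note the final filter(pd.notnull, order): on a scalar, pandas.notnull() returns a plain bool, so it also works as a predicate over ordinary Python iterables. A small standalone demonstration:

import numpy as np
import pandas as pd

levels = ["b", np.nan, "a", None, "c"]

print(list(filter(pd.notnull, levels)))      # ['b', 'a', 'c']
print([x for x in levels if pd.notnull(x)])  # same result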
Example #21
Source File: solution.py    From Kaggle with MIT License
def pre_processData(train_data,file_path):
    train_data.loc[(train_data.Age.isnull()), 'Age' ] = np.mean(train_data.Age)  # fill missing ages with the mean age
    train_data.loc[(train_data.Cabin.notnull(),'Cabin')] = 'yes' # set Cabin to 'yes' where it is not null
    train_data.loc[(train_data.Cabin.isnull(),'Cabin')] = 'no'
    '''0/1 dummy encoding'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'],prefix='Cabin')  # get_dummies returns 0/1 indicator columns, one per category; prefix is set to 'Cabin'
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'],prefix='Pclass')
    train_data = pd.concat([train_data,dummies_cabin,dummies_Embarked,dummies_Pclass,dummies_Sex], axis=1)  # concatenate the DataFrames column-wise (axis=1)
    train_data.drop(['Pclass','Name','Sex','Embarked','Cabin','Ticket'],axis=1,inplace=True)   # drop the original, unprocessed columns
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a comma-separated string
    np.savetxt(file_path+r'/pre_processData1.csv', train_data, delimiter=',',header=header_string)  # save the preprocessed data to the given directory
    '''mean normalization of Age and Fare'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data['Age'])
    train_data['Age'] = age_scaler.fit_transform(train_data['Age'])
    if np.sum(train_data.Fare.isnull()):  # if Fare has missing values, fill them with the mean
        train_data.loc[(train_data.Fare.isnull(),'Fare')]=np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data['Fare'])
    train_data['Fare'] = fare_scaler.transform(train_data['Fare'])
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a comma-separated string
    np.savetxt(file_path+r'/pre_processData_scaled.csv', train_data, delimiter=',',header=header_string)  # save the preprocessed data to the given directory
    return train_data


## feature engineering: data preprocessing
Example #22
Source File: utils_graph.py    From osmnx with MIT License
def graph_from_gdfs(gdf_nodes, gdf_edges, graph_attrs=None):
    """
    Convert node and edge GeoDataFrames to a MultiDiGraph.

    This function is the inverse of `graph_to_gdfs`.

    Parameters
    ----------
    gdf_nodes : geopandas.GeoDataFrame
        GeoDataFrame of graph nodes
    gdf_edges : geopandas.GeoDataFrame
        GeoDataFrame of graph edges, must have crs attribute set
    graph_attrs : dict
        the new G.graph attribute dict; if None, add crs as the only
        graph-level attribute

    Returns
    -------
    G : networkx.MultiDiGraph
    """
    if graph_attrs is None:
        graph_attrs = {"crs": gdf_edges.crs}
    G = nx.MultiDiGraph(**graph_attrs)

    # add the nodes then each node's non-null attributes
    G.add_nodes_from(gdf_nodes.index)
    for col in gdf_nodes.columns:
        nx.set_node_attributes(G, name=col, values=gdf_nodes[col].dropna())

    # add each edge and its non-null attributes
    for (u, v, k), row in gdf_edges.set_index(["u", "v", "key"]).iterrows():
        d = {label: val for label, val in row.items() if isinstance(val, list) or pd.notnull(val)}
        G.add_edge(u, v, k, **d)

    utils.log("Created graph from node/edge GeoDataFrames")
    return G 
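The isinstance(val, list) test above matters because pd.notnull() applied to a list returns an element-wise boolean array rather than a single bool, which would break the truth test; list attributes are therefore kept unconditionally. A minimal sketch of the same row-to-attribute-dict step with invented fields:

import numpy as np
import pandas as pd

row = pd.Series({"name": "Main St", "lanes": np.nan, "osmid": [111, 222]})

# Keep list values as-is; keep anything else only if it is not null
d = {label: val for label, val in row.items()
     if isinstance(val, list) or pd.notnull(val)}
print(d)  # {'name': 'Main St', 'osmid': [111, 222]}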
Example #23
Source File: history_container.py    From zipline-chinese with Apache License 2.0
def update_last_known_values(self):
        """
        Store the non-NaN values from our oldest frame in each frequency.
        """
        ffillable = self.ffillable_fields
        if not len(ffillable):
            return

        for frequency in self.unique_frequencies:
            digest_panel = self.digest_panels.get(frequency, None)
            if digest_panel:
                oldest_known_values = digest_panel.oldest_frame(raw=True)
            else:
                oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

            oldest_vals = oldest_known_values
            oldest_columns = self.fields
            for field in ffillable:
                f_idx = oldest_columns.get_loc(field)
                field_vals = oldest_vals[f_idx]
                # isnan would be fast, possible to use?
                non_nan_sids = np.where(pd.notnull(field_vals))
                key = (frequency.freq_str, field)
                key_loc = self.last_known_prior_values.index.get_loc(key)
                self.last_known_prior_values.values[
                    key_loc, non_nan_sids
                ] = field_vals[non_nan_sids] 
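pandas.notnull() also accepts plain NumPy arrays (including object arrays, where numpy.isnan would raise), which is why it is used here to locate the non-NaN positions in the raw frame. A short sketch of that lookup with made-up values:

import numpy as np
import pandas as pd

field_vals = np.array([101.5, np.nan, 99.0, np.nan])

non_nan_sids = np.where(pd.notnull(field_vals))  # integer positions of the non-NaN entries
print(non_nan_sids[0])               # [0 2]
print(field_vals[non_nan_sids])      # [101.5  99. ]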
Example #24
Source File: label_encoder_for_multi_fit.py    From KDD-Cup-AutoML-5 with MIT License
def fit(self, data):
        assert isinstance(data, pd.Series)
        data = data[pd.notnull(data)]
        self.labels = self.labels | set(data.tolist()) 
Example #25
Source File: intent_classifier.py    From SKF-Chatbot with GNU Affero General Public License v3.0
def data_prepare():
    col = ['classs', 'question']
    y=get_data()
    y = y[col]
    y = y[pd.notnull(y['question'])]
    y.columns = ['classs', 'question']
    y['category_id'] = y['classs'].factorize()[0]
    category_id_df = y[['classs', 'category_id']].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', 'classs']].values)
    return y 
Example #26
Source File: transform.py    From marcotti with MIT License
def venues(self, data_frame):
        lambdafunc = lambda x: pd.Series([
            self.get_id(mco.Countries, name=x['country']),
            self.get_id(mco.Timezones, name=x['timezone']),
            self.get_id(mco.Surfaces, description=x['surface']),
            self.make_date_object(x['config_date'])
        ])
        ids_frame = data_frame.apply(lambdafunc, axis=1)
        ids_frame.columns = ['country_id', 'timezone_id', 'surface_id', 'eff_date']
        joined_frame = data_frame.join(ids_frame).drop(['country', 'timezone', 'surface', 'config_date'], axis=1)
        new_frame = joined_frame.where((pd.notnull(joined_frame)), None)
        return new_frame 
Example #27
Source File: test_data_cleaner.py    From data-cleaner with MIT License
def nan_to_empty_string_list(iterable):
    """Retorna una lista convirtiendo valores nulos a None."""
    return [i if pd.notnull(i) else "" for i in iterable] 
Example #28
Source File: apply_matcher.py    From py_stringsimjoin with BSD 3-Clause "New" or "Revised" License
def generate_tokens(table, key_attr, join_attr, tokenizer):
    table_nonnull = table[pd.notnull(table[join_attr])]
    return dict(zip(table_nonnull[key_attr],
                    table_nonnull[join_attr].apply(tokenizer.tokenize))) 
Example #29
Source File: data_cleaner.py    From data-cleaner with MIT License
def _split(value, separators):
        values = []
        for separator in separators:
            if separator in str(value):
                values = [str(split_value) for split_value in
                          value.split(separator)]
                break

        return pd.Series([str(value).strip() for value in values
                          if pd.notnull(value)]) 
Example #30
Source File: merge_pbp_shifts.py    From Hockey-Scraper with GNU General Public License v3.0
def merge(pbp_df, shifts_df):
    """
    Merge the shifts_df into the pbp_df.

    :param pbp_df: Play by Play DataFrame
    :param shifts_df: Shift Tables DataFrame

    :return: Play by Play DataFrame with shift info embedded
    """
    # To get the final pbp columns in the "correct" order
    pbp_columns = pbp_df.columns

    shifts_df['Player_Id'] = shifts_df['Player_Id'].astype(int)

    # Get unique game_id -> teams pair for placing in Shifts_df
    pbp_unique = pbp_df.drop_duplicates(subset=['Game_Id', 'Home_Team', 'Away_Team'])[['Game_Id', 'Home_Team', 'Away_Team']]

    # Group up shifts that start/end at the same time
    new_shifts = group_shifts(pbp_unique, shifts_df)
    new_shifts = new_shifts.where((pd.notnull(new_shifts)), None)

    # Add in & order rows
    new_pbp = pbp_df.append(new_shifts).reset_index(drop=True)
    new_pbp['Priority'] = new_pbp.apply(label_priority, axis=1)
    new_pbp = new_pbp.sort_values(by=['Game_Id', 'Period', 'Seconds_Elapsed', 'Priority'])

    return new_pbp[pbp_columns]