Python pandas.notnull() Examples
The following are 30 code examples of pandas.notnull(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
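
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what pandas.notnull() returns: an element-wise boolean mask that is True wherever a value is not missing (NaN, None, or NaT). The DataFrame and column names are hypothetical, chosen only for illustration.

import numpy as np
import pandas as pd

# Hypothetical toy frame with one missing value per column
df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": ["x", None, "z"]})

# Element-wise boolean DataFrame: True where a value is present
mask = pd.notnull(df)

# Common idiom seen throughout the examples below:
# keep only rows whose 'a' value is not missing
clean = df[pd.notnull(df["a"])]

print(mask)
print(clean)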
Example #1
Source File: quality.py From ssbio with MIT License | 6 votes |
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """
    # TODO: generalize column names for all results, save as dict instead
    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns={1: 'psqs_local', 2: 'psqs_burial', 3: 'psqs_contact', 4: 'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x) == 4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x) > 4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results
Example #2
Source File: cbc_hb.py From lifestyles with MIT License | 6 votes |
def _create_observation_variable(individual_selections, choices, partsworth):
    """
    This function handles creating the PyMC3 observation variables. It also gracefully handles missing
    observations in individual selections.

    `individual_selections` is a Series of the individuals selections made, starting from 0. It can contain
    NaNs which represent answer was not provided.

    `choices` is a DataFrame with a hierarchical index: level=0 enumerates the choices, and level=1 displays
    the profile at a specific choice. It's size is (n_questions, n_choices_per_question).

    `partsworth` is a slice of PyMC3 matrix. It represents the partsworth variables of a individual.
    Size is (n_profiles,)

    This computes the values exp(partsworth * profile_j) / sum[ exp(partsworth * profile_k ] for all j.
    """
    nan_mask = pd.notnull(individual_selections)
    return pm.Categorical("Obs_%s" % individual_selections.name,
                          tt.nnet.softmax(tt.stack([
                              tt.dot(choice.values, partsworth)
                              for _, choice in choices[nan_mask.values].groupby(axis=1, level=0)
                          ], axis=0).T),
                          observed=individual_selections[nan_mask.values].values)
Example #3
Source File: utils.py From recordlinkage with BSD 3-Clause "New" or "Revised" License | 6 votes |
def fillna(series_or_arr, missing_value=0.0):
    """Fill missing values in pandas objects and numpy arrays.

    Arguments
    ---------
    series_or_arr : pandas.Series, numpy.ndarray
        The numpy array or pandas series for which the missing values
        need to be replaced.
    missing_value : float, int, str
        The value to replace the missing value with. Default 0.0.

    Returns
    -------
    pandas.Series, numpy.ndarray
        The numpy array or pandas series with the missing values filled.
    """
    if pandas.notnull(missing_value):
        if isinstance(series_or_arr, (numpy.ndarray)):
            series_or_arr[numpy.isnan(series_or_arr)] = missing_value
        else:
            series_or_arr.fillna(missing_value, inplace=True)

    return series_or_arr
Example #4
Source File: datasets.py From deepchem with MIT License | 6 votes |
def load_metadata(self):
    try:
        tasks_filename, metadata_filename = self._get_metadata_filename()
        with open(tasks_filename) as fin:
            tasks = json.load(fin)
        metadata_df = pd.read_csv(metadata_filename, compression='gzip')
        metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
        return tasks, metadata_df
    except Exception as e:
        pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
        tasks, metadata_df = load_from_disk(metadata_filename)
        del metadata_df['task_names']
        del metadata_df['basename']
        save_metadata(tasks, metadata_df, self.data_dir)
        return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk")
Example #5
Source File: plotter.py From pygraphistry with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _make_json_dataset(self, edges, nodes, name):
    (elist, nlist) = self._bind_attributes_v1(edges, nodes)
    edict = elist.where((pandas.notnull(elist)), None).to_dict(orient='records')

    bindings = {'idField': self._node or Plotter._defaultNodeId,
                'destinationField': self._destination,
                'sourceField': self._source}
    dataset = {'name': PyGraphistry._config['dataset_prefix'] + name,
               'bindings': bindings,
               'type': 'edgelist',
               'graph': edict}

    if nlist is not None:
        ndict = nlist.where((pandas.notnull(nlist)), None).to_dict(orient='records')
        dataset['labels'] = ndict
    return dataset


# Main helper for creating ETL2 payload
Example #6
Source File: vgraph.py From pygraphistry with BSD 3-Clause "New" or "Revised" License | 6 votes |
def objectEncoder(vg, series, dtype):
    series.where(pandas.notnull(series), '\0', inplace=True)
    # vec is a string[] submessage within a repeated
    vec = vg.string_vectors.add()
    str_series = None
    try:
        str_series = series.astype('unicode')
    except UnicodeDecodeError:
        warnings.warn("Warning: escaping unicode")
        str_series = series.apply(lambda v: v.decode('utf-8'))
    for val in str_series:
        vec.values.append(val)
    return (vec, {'ctype': 'utf8'})


# NaN (as well as Infinity and undefined) are valid JSON. Use this guard to filter
# them out when creating the json metadata.
Example #7
Source File: datasets.py From PADME with MIT License | 6 votes |
def load_metadata(self):
    try:
        tasks_filename, metadata_filename = self._get_metadata_filename()
        with open(tasks_filename) as fin:
            tasks = json.load(fin)
        metadata_df = pd.read_csv(metadata_filename, compression='gzip')
        metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
        return tasks, metadata_df
    except Exception as e:
        pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
        tasks, metadata_df = load_from_disk(metadata_filename)
        del metadata_df['task_names']
        del metadata_df['basename']
        save_metadata(tasks, metadata_df, self.data_dir)
        return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk")
Example #8
Source File: utils.py From urbansprawl with MIT License | 6 votes |
def load_geodataframe(geo_filename):
    """
    Load input GeoDataFrame

    Parameters
    ----------
    geo_filename : string
        input GeoDataFrame filename

    Returns
    ----------
    geopandas.GeoDataFrame
        loaded data
    """
    # Load using geopandas
    df_osm_data = gpd.read_file(geo_filename)
    # Set None as NaN
    df_osm_data.fillna(value=np.nan, inplace=True)
    # Replace empty string (Json NULL sometimes read as '') for NaN
    df_osm_data.replace('', np.nan, inplace=True)

    def list_int_from_string(x):
        # List of integers given input in string format
        return [int(id_) for id_ in x.split(",")]

    def list_str_from_string(x):
        # List of strings given input in string format
        return x.split(",")

    # Recover list
    if ("activity_category" in df_osm_data.columns):
        df_osm_data["activity_category"] = df_osm_data.activity_category.apply(lambda x: list_str_from_string(x) if pd.notnull(x) else np.nan)
    if ("containing_parts" in df_osm_data.columns):
        df_osm_data["containing_parts"] = df_osm_data.containing_parts.apply(lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan)
    if ("containing_poi" in df_osm_data.columns):
        df_osm_data["containing_poi"] = df_osm_data.containing_poi.apply(lambda x: list_int_from_string(x) if pd.notnull(x) else np.nan)

    # To UTM coordinates
    return ox.project_gdf(df_osm_data)
Example #9
Source File: generate_avro_file.py From tfx with Apache License 2.0 | 6 votes |
def generate_avro(src_file: Text, output_file: Text):
    """Generates avro file based on src file.

    Args:
      src_file: path to Chicago taxi dataset.
      output_file: output path for avro file.
    """
    df = pd.read_csv(src_file)
    # Replaces NaN's with None's for avroWriter to interpret null values
    df = df.where((pd.notnull(df)), None)
    records = df.to_dict(orient='records')

    parsed_schema = fastavro.parse_schema(get_schema())
    with open(output_file, 'wb') as f:
        fastavro.writer(f, parsed_schema, records)
Example #10
Source File: test_mice.py From vnpy_crypto with MIT License | 6 votes |
def test_pertmeth(self):
    # Test with specified perturbation method.

    df = gendat()
    orig = df.copy()
    mx = pd.notnull(df)
    nrow, ncol = df.shape

    for pert_meth in "gaussian", "boot":

        imp_data = mice.MICEData(df, perturbation_method=pert_meth)

        for k in range(2):
            imp_data.update_all()
            assert_equal(imp_data.data.shape[0], nrow)
            assert_equal(imp_data.data.shape[1], ncol)
            assert_allclose(orig[mx], imp_data.data[mx])

    assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1'])
Example #11
Source File: Trajectory.py From TrajLib with Apache License 2.0 | 6 votes |
def pre_processing(self, labels):
    # removing NaN in lat and lon
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
    for label in labels:
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]
    """
    lat_ = self.raw_data.lat.rolling(3, min_periods=1).median()
    self.raw_data.assign(lat=lat_)
    lon_ = self.raw_data.lon.rolling(3, min_periods=1).median()
    self.raw_data.assign(lot=lon_)
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
    """
    return None
Example #12
Source File: dataframe_utils.py From fileflow with Apache License 2.0 | 6 votes |
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. if None, string of data
        will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False,
                         na_rep=None, skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result
Example #13
Source File: test_logic.py From ontask_b with MIT License | 6 votes |
def test_df_equivalent_after_sql(self):
    # Parse the CSV
    df_source = services.load_df_from_csvfile(
        io.StringIO(self.csv1),
        0,
        0)

    # Store the DF in the DB
    pandas.store_table(df_source, self.table_name)

    # Load it from the DB
    df_dst = pandas.load_table(self.table_name)

    # NaN in boolean columns are now None
    df_source['bool1'] = df_source['bool1'].where(
        pd.notnull(df_source['bool1']),
        None)
    df_source['bool2'] = df_source['bool2'].where(
        pd.notnull(df_source['bool2']),
        None)

    # Data frames must be identical
    assert df_source.equals(df_dst)
Example #14
Source File: movie_data.py From parade with MIT License | 6 votes |
def execute_internal(self, context, **kwargs):
    """
    the internal execution process to be implemented
    :param context:
    :param kwargs:
    :return:
    """
    df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

    # Process projection on the dataset to get our interested attributes
    df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

    # Filter out records with *NAN* title_year and budget
    df = df[pd.notnull(df['title_year'])]
    df = df[df['budget'] > 0]

    # Extract the genres ROOT
    df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

    return df
Example #15
Source File: uniprot.py From ssbio with MIT License | 6 votes |
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """
    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
Example #16
Source File: finta.py From finta with GNU Lesser General Public License v3.0 | 5 votes |
def KAMA(
    cls,
    ohlc: DataFrame,
    er: int = 10,
    ema_fast: int = 2,
    ema_slow: int = 30,
    period: int = 20,
) -> Series:
    """Developed by Perry Kaufman, Kaufman's Adaptive Moving Average (KAMA) is a moving average
    designed to account for market noise or volatility. Its main advantage is that it takes into
    consideration not just the direction, but the market volatility as well."""

    er = cls.ER(ohlc, er)
    fast_alpha = 2 / (ema_fast + 1)
    slow_alpha = 2 / (ema_slow + 1)
    sc = pd.Series(
        (er * (fast_alpha - slow_alpha) + slow_alpha) ** 2,
        name="smoothing_constant",
    )  ## smoothing constant

    sma = pd.Series(
        ohlc["close"].rolling(period).mean(), name="SMA"
    )  ## first KAMA is SMA

    kama = []
    # Current KAMA = Prior KAMA + smoothing_constant * (Price - Prior KAMA)
    for s, ma, price in zip(
        sc.iteritems(), sma.shift().iteritems(), ohlc["close"].iteritems()
    ):
        try:
            kama.append(kama[-1] + s[1] * (price[1] - kama[-1]))
        except (IndexError, TypeError):
            if pd.notnull(ma[1]):
                kama.append(ma[1] + s[1] * (price[1] - ma[1]))
            else:
                kama.append(None)

    sma["KAMA"] = pd.Series(
        kama, index=sma.index, name="{0} period KAMA.".format(period)
    )  ## apply the kama list to existing index

    return sma["KAMA"]
Example #17
Source File: dataframe_utils.py From fileflow with Apache License 2.0 | 5 votes |
def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Reads a utf-8 encoded CSV directly into a pandas dataframe as string values and
    scrubs np.NaN values to Python None

    :param str filename_or_stream: path to CSV
    :return:
    """
    # pulls data in as utf8, all as strings, and without pre whitespace padding
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True
        )
    except AttributeError:
        # this is an empty dataframe and pandas crashed because it can't coerce the columns to strings
        # issue and PR to fix is open on pandas core at https://github.com/pydata/pandas/issues/12048
        # slated for 1.8 release
        # so for now just try loading the dataframe without specifying dtype
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True
        )

    logging.info('File read via the pandas read_csv methodology.')

    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where((pd.notnull(data)), None)
    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None

    logging.info("Dataframe of shape %s has been retrieved." % str(data.shape))

    return data
Example #18
Source File: Trajectory.py From TrajLib with Apache License 2.0 | 5 votes |
def load_data(self, **kwargs):
    # lat='lat',lon='lon',alt='alt',timeDate='timeDate',labels=['label1'],src='~/gps_fe/bigdata2_8696/ex_traj/5428_walk_790.csv',seperator=','
    print('loading...')
    lat = kwargs.get('lat', "lat")
    print(lat)
    lon = kwargs.get('lon', "lon")
    print(lon)
    alt = kwargs.get('alt', None)
    print(alt)
    time_date = kwargs.get('timeDate', "timeDate")
    print(time_date)
    labels = kwargs.get('labels', "[label]")
    print(labels)
    src = kwargs.get('src', "~/gps_fe/bigdata2_8696/ex_traj/5428_walk_790.csv")
    print(src)
    separator = kwargs.get('separator', ",")
    print(separator)
    self.labels = labels

    # input data needs lat,lon,alt,timeDate, [Labels]
    self.raw_data = pd.read_csv(src, sep=separator, parse_dates=[time_date], index_col=time_date)

    self.raw_data.rename(columns={lat: 'lat'}, inplace=True)
    self.raw_data.rename(columns={lon: 'lon'}, inplace=True)
    if alt is not None:
        self.raw_data.rename(columns={alt: 'alt'}, inplace=True)
    self.raw_data.rename(columns={time_date: 'timeDate'}, inplace=True)

    # preprocessing
    # removing NaN in lat and lon
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lat), :]
    self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data.lon), :]
    for label in labels:
        self.raw_data = self.raw_data.loc[pd.notnull(self.raw_data[label]), :]

    print('Data loaded.')
    return self.raw_data
Example #19
Source File: crypto_data.py From investpy with MIT License | 5 votes |
def cryptos_as_list():
    """
    This function retrieves all the crypto coin names stored in `cryptos.csv` file, which contains all the
    data from the crypto coins as previously retrieved from Investing.com. So on, this function will just
    return the crypto coin names which will be the main input parameters when it comes to crypto data
    retrieval functions from investpy.

    Note that just some cryptos are available for retrieval, since Investing.com does not provide information
    from all the available ones, just the main ones.

    Returns:
        :obj:`list` - cryptos_list:
            The resulting :obj:`list` contains the all the available crypto coin names as indexed in
            Investing.com from the information previously retrieved by investpy and stored on a csv file.

            In case the information was successfully retrieved, the :obj:`list` of crypto coin names will look like::

                cryptos_list = ['Bitcoin', 'Ethereum', 'XRP', 'Bitcoin Cash', 'Tether', 'Litecoin', ...]

    Raises:
        FileNotFoundError: raised if `cryptos.csv` file was not found.
        IOError: raised when `cryptos.csv` file is missing or empty.
    """

    resource_package = 'investpy'
    resource_path = '/'.join(('resources', 'crypto', 'cryptos.csv'))
    if pkg_resources.resource_exists(resource_package, resource_path):
        cryptos = pd.read_csv(pkg_resources.resource_filename(resource_package, resource_path))
    else:
        raise FileNotFoundError("ERR#0081: cryptos file not found or errored.")

    if cryptos is None:
        raise IOError("ERR#0082: cryptos not found or unable to retrieve.")

    cryptos = cryptos[cryptos['status'] == 'available']
    cryptos.drop(columns=['tag', 'id', 'status'], inplace=True)

    cryptos = cryptos.where(pd.notnull(cryptos), None)

    return cryptos['name'].tolist()
Example #20
Source File: utils.py From reportgen with MIT License | 5 votes |
def categorical_order(values, order=None):
    """Return a list of unique data values.

    Determine an ordered list of levels in ``values``.

    Parameters
    ----------
    values : list, array, Categorical, or Series
        Vector of "categorical" values
    order : list-like, optional
        Desired order of category levels to override the order determined
        from the ``values`` object.

    Returns
    -------
    order : list
        Ordered list of category levels not including null values.

    """
    if order is None:
        if hasattr(values, "categories"):
            order = values.categories
        else:
            try:
                order = values.cat.categories
            except (TypeError, AttributeError):
                try:
                    order = values.unique()
                except AttributeError:
                    order = pd.unique(values)
                try:
                    np.asarray(values).astype(np.float)
                    order = np.sort(order)
                except (ValueError, TypeError):
                    order = order
        order = filter(pd.notnull, order)
    return list(order)
Example #21
Source File: solution.py From Kaggle with MIT License | 5 votes |
def pre_processData(train_data, file_path):
    train_data.loc[(train_data.Age.isnull()), 'Age'] = np.mean(train_data.Age)  # Fill missing ages with the mean age
    train_data.loc[(train_data.Cabin.notnull(), 'Cabin')] = 'yes'  # Set Cabin to 'yes' where it is not null
    train_data.loc[(train_data.Cabin.isnull(), 'Cabin')] = 'no'

    '''0/1 (one-hot) encoding'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'], prefix='Cabin')  # get_dummies returns 0/1-encoded data, one column per category; prefix is set to Cabin
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'], prefix='Pclass')
    train_data = pd.concat([train_data, dummies_cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)  # Concatenate dataframes; axis=1 joins by columns
    train_data.drop(['Pclass', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket'], axis=1, inplace=True)  # Drop the original, unprocessed columns

    header_string = ','.join(train_data.columns.tolist())  # Join column names into a comma-separated string
    np.savetxt(file_path + r'/pre_processData1.csv', train_data, delimiter=',', header=header_string)  # Save the preprocessed data to the given directory

    '''Mean normalization (Age and Fare)'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data['Age'])
    train_data['Age'] = age_scaler.fit_transform(train_data['Age'])
    if np.sum(train_data.Fare.isnull()):  # If any Fare values are missing, fill them with the mean
        train_data.loc[(train_data.Fare.isnull(), 'Fare')] = np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data['Fare'])
    train_data['Fare'] = fare_scaler.transform(train_data['Fare'])

    header_string = ','.join(train_data.columns.tolist())  # Join column names into a comma-separated string
    np.savetxt(file_path + r'/pre_processData_scaled.csv', train_data, delimiter=',', header=header_string)  # Save the preprocessed data to the given directory

    return train_data


## feature engineering: preprocess the data
Example #22
Source File: utils_graph.py From osmnx with MIT License | 5 votes |
def graph_from_gdfs(gdf_nodes, gdf_edges, graph_attrs=None):
    """
    Convert node and edge GeoDataFrames to a MultiDiGraph.

    This function is the inverse of `graph_to_gdfs`.

    Parameters
    ----------
    gdf_nodes : geopandas.GeoDataFrame
        GeoDataFrame of graph nodes
    gdf_edges : geopandas.GeoDataFrame
        GeoDataFrame of graph edges, must have crs attribute set
    graph_attrs : dict
        the new G.graph attribute dict; if None, add crs as the only
        graph-level attribute

    Returns
    -------
    G : networkx.MultiDiGraph
    """
    if graph_attrs is None:
        graph_attrs = {"crs": gdf_edges.crs}
    G = nx.MultiDiGraph(**graph_attrs)

    # add the nodes then each node's non-null attributes
    G.add_nodes_from(gdf_nodes.index)
    for col in gdf_nodes.columns:
        nx.set_node_attributes(G, name=col, values=gdf_nodes[col].dropna())

    # add each edge and its non-null attributes
    for (u, v, k), row in gdf_edges.set_index(["u", "v", "key"]).iterrows():
        d = {label: val for label, val in row.items() if isinstance(val, list) or pd.notnull(val)}
        G.add_edge(u, v, k, **d)

    utils.log("Created graph from node/edge GeoDataFrames")
    return G
Example #23
Source File: history_container.py From zipline-chinese with Apache License 2.0 | 5 votes |
def update_last_known_values(self):
    """
    Store the non-NaN values from our oldest frame in each frequency.
    """
    ffillable = self.ffillable_fields
    if not len(ffillable):
        return

    for frequency in self.unique_frequencies:
        digest_panel = self.digest_panels.get(frequency, None)
        if digest_panel:
            oldest_known_values = digest_panel.oldest_frame(raw=True)
        else:
            oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

        oldest_vals = oldest_known_values
        oldest_columns = self.fields
        for field in ffillable:
            f_idx = oldest_columns.get_loc(field)
            field_vals = oldest_vals[f_idx]
            # isnan would be fast, possible to use?
            non_nan_sids = np.where(pd.notnull(field_vals))
            key = (frequency.freq_str, field)
            key_loc = self.last_known_prior_values.index.get_loc(key)
            self.last_known_prior_values.values[
                key_loc, non_nan_sids
            ] = field_vals[non_nan_sids]
Example #24
Source File: label_encoder_for_multi_fit.py From KDD-Cup-AutoML-5 with MIT License | 5 votes |
def fit(self, data):
    assert isinstance(data, pd.Series)
    data = data[pd.notnull(data)]
    self.labels = self.labels | set(data.tolist())
Example #25
Source File: intent_classifier.py From SKF-Chatbot with GNU Affero General Public License v3.0 | 5 votes |
def data_prepare():
    col = ['classs', 'question']
    y = get_data()
    y = y[col]
    y = y[pd.notnull(y['question'])]
    y.columns = ['classs', 'question']
    y['category_id'] = y['classs'].factorize()[0]
    category_id_df = y[['classs', 'category_id']].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', 'classs']].values)
    return y
Example #26
Source File: transform.py From marcotti with MIT License | 5 votes |
def venues(self, data_frame):
    lambdafunc = lambda x: pd.Series([
        self.get_id(mco.Countries, name=x['country']),
        self.get_id(mco.Timezones, name=x['timezone']),
        self.get_id(mco.Surfaces, description=x['surface']),
        self.make_date_object(x['config_date'])
    ])
    ids_frame = data_frame.apply(lambdafunc, axis=1)
    ids_frame.columns = ['country_id', 'timezone_id', 'surface_id', 'eff_date']
    joined_frame = data_frame.join(ids_frame).drop(['country', 'timezone', 'surface', 'config_date'], axis=1)
    new_frame = joined_frame.where((pd.notnull(joined_frame)), None)
    return new_frame
Example #27
Source File: test_data_cleaner.py From data-cleaner with MIT License | 5 votes |
def nan_to_empty_string_list(iterable):
    """Return a list, converting null values to empty strings."""
    return [i if pd.notnull(i) else "" for i in iterable]
Example #28
Source File: apply_matcher.py From py_stringsimjoin with BSD 3-Clause "New" or "Revised" License | 5 votes |
def generate_tokens(table, key_attr, join_attr, tokenizer):
    table_nonnull = table[pd.notnull(table[join_attr])]
    return dict(zip(table_nonnull[key_attr],
                    table_nonnull[join_attr].apply(tokenizer.tokenize)))
Example #29
Source File: data_cleaner.py From data-cleaner with MIT License | 5 votes |
def _split(value, separators):
    values = []
    for separator in separators:
        if separator in str(value):
            values = [str(split_value) for split_value in value.split(separator)]
            break

    return pd.Series([str(value).strip() for value in values
                      if pd.notnull(value)])
Example #30
Source File: merge_pbp_shifts.py From Hockey-Scraper with GNU General Public License v3.0 | 5 votes |
def merge(pbp_df, shifts_df):
    """
    Merge the shifts_df into the pbp_df.

    :param pbp_df: Play by Play DataFrame
    :param shifts_df: Shift Tables DataFrame

    :return: Play by Play DataFrame with shift info embedded
    """
    # To get the final pbp columns in the "correct" order
    pbp_columns = pbp_df.columns

    shifts_df['Player_Id'] = shifts_df['Player_Id'].astype(int)

    # Get unique game_id -> teams pair for placing in Shifts_df
    pbp_unique = pbp_df.drop_duplicates(subset=['Game_Id', 'Home_Team', 'Away_Team'])[['Game_Id', 'Home_Team', 'Away_Team']]

    # Group up shifts that start/end at the same time
    new_shifts = group_shifts(pbp_unique, shifts_df)
    new_shifts = new_shifts.where((pd.notnull(new_shifts)), None)

    # Add in & order rows
    new_pbp = pbp_df.append(new_shifts).reset_index(drop=True)
    new_pbp['Priority'] = new_pbp.apply(label_priority, axis=1)
    new_pbp = new_pbp.sort_values(by=['Game_Id', 'Period', 'Seconds_Elapsed', 'Priority'])

    return new_pbp[pbp_columns]