Python pandas.dataframe() Examples
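The following are 30 code examples that mention pandas.dataframe(). Note that the actual pandas class is spelled pandas.DataFrame; most of the examples below use the lowercase spelling only in docstrings.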
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: trade_api.py From TradeApi with Apache License 2.0 | 6 votes |
def query_account(self, format=""):
    """Query account information through the remote ``oms.query_account`` call.

    Args:
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Account"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    result_format = self._get_format(format, "pandas")
    # Column-oriented data is requested only when pandas output is wanted.
    request = {"format": "columnset"} if result_format == "pandas" else {}

    reply = self._remote.call("oms.query_account", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Account")
Example #2
Source File: sample_based_voting.py From lale with Apache License 2.0 | 6 votes |
def transform(self, X, end_index_list=None):
    """Collapse per-sample labels into one majority-voted label per window.

    ``X`` is treated as a sequence of labels. Consecutive windows are cut
    from it using ``end_index_list``: window *k* covers
    ``X[end_index_list[k-1]:end_index_list[k]]`` (the first window starts
    at 0), and the most frequent label in each window is emitted.

    Fixes relative to the previous version:
      * ``pd.dataframe`` does not exist; the check now uses ``pd.DataFrame``.
      * ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` is
        used instead.
      * ``prev_index`` is advanced after every window (it used to stay 0,
        so every window started at the beginning of ``X``).
      * the winning *label* is stored, not its position in the array of
        unique values.

    Args:
        X: Labels as a ``np.ndarray``, a ``list`` or a ``pd.DataFrame``.
        end_index_list: End index (exclusive, as used by the slice) of each
            window. When ``None``, ``self.end_index_list`` is used instead.

    Returns:
        ``X`` unchanged when no end indices are available, otherwise a
        ``np.ndarray`` with one voted label per entry of ``end_index_list``.

    Raises:
        ValueError: if a window is empty (``np.argmax`` of an empty array).
    """
    if end_index_list is None:
        # in case the end_index_list was set as meta_data
        end_index_list = self.end_index_list
    if end_index_list is None:
        return X

    if isinstance(X, list):
        X = np.array(X)
    elif isinstance(X, pd.DataFrame):
        X = X.values

    voted_labels = []
    prev_index = 0
    for index in end_index_list:
        labels = X[prev_index:index]
        values, counts = np.unique(labels, return_counts=True)
        # If two labels are in majority, argmax picks the first one
        # (np.unique returns the values sorted).
        voted_labels.append(values[np.argmax(counts)])
        prev_index = index
    return np.array(voted_labels)
Example #3
Source File: pdutils.py From pysystemtrade with GNU General Public License v3.0 | 6 votes |
def from_dict_of_values_to_df(data_dict, ts_index, columns=None):
    """
    Build a DataFrame in which every column holds one repeated scalar.

    :param data_dict: A dict of scalars
    :param ts_index: A timeseries index
    :param columns: (optional) A list of str selecting / ordering the columns
                    [every entry must be a key of data_dict]
    :return: pd.DataFrame indexed by ts_index; column names come from data_dict,
             values are the scalars repeated len(ts_index) times
    """
    selected = list(data_dict.keys() if columns is None else columns)
    n_rows = len(ts_index)

    repeated = {name: [data_dict[name]] * n_rows for name in selected}

    return pd.DataFrame(repeated, ts_index)
Example #4
Source File: pdutils.py From pysystemtrade with GNU General Public License v3.0 | 6 votes |
def dataframe_pad(starting_df, column_list, padwith=0.0):
    """
    Take a dataframe and add extra columns if necessary so we end up with
    columns named column_list (and only those, in that order).

    Fix: missing columns are now filled with ``padwith``; previously the
    argument was ignored and 0.0 was always used.

    :param starting_df: A pd.DataFrame with named columns
    :param column_list: A list of column names
    :param padwith: The value to pad missing columns with
    :return: pd.DataFrame with the index of starting_df and columns column_list
    """
    def _pad_column(column_name):
        # Reuse the existing column when present, otherwise build a constant one.
        if column_name in starting_df.columns:
            return starting_df[column_name]
        return pd.Series([padwith] * len(starting_df.index), starting_df.index)

    new_data = [_pad_column(column_name) for column_name in column_list]

    new_df = pd.concat(new_data, axis=1)
    new_df.columns = column_list

    return new_df
Example #5
Source File: classification.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def check_params_with_data(self, df, actual_field, predicted_field):
    """ Validate the scoring parameters against the ground-truth data.

    This base implementation performs no check at all: it always raises.
    According to the original documentation, overriding versions handle
    errors about the cardinality of the ground-truth labels and check the
    pos_label parameter, assuming the data is already cleaned and categorical.

    Args:
        df (pd.DataFrame): input dataframe
        actual_field (str): name of ground-truth field
        predicted_field (str): name of predicted field

    Raises:
        MLSPLNotImplementedError: always, naming ``self.scoring_name``.
    """
    raise MLSPLNotImplementedError(
        'Scoring method {} does not support "check_params_with_data" method.'.format(self.scoring_name)
    )
Example #6
Source File: classification.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def score(self, df, options):
    """ Run the scoring pipeline: prepare labels, score them, wrap the result.

    Args:
        df (pd.DataFrame): input dataframe
        options (dict): passed options

    Returns:
        df_output (pd.DataFrame): whatever ``self.create_output`` builds from
            the scoring result
    """
    # Ground-truth and predicted labels, extracted from the configured fields
    prepared = self.prepare_input_data(df, self.actual_field, self.predicted_field, options)
    truth, prediction = prepared

    # The scoring function receives the stored params as keyword arguments
    outcome = self.scoring_function(truth, prediction, **self.params)

    return self.create_output(self.scoring_name, outcome)
Example #7
Source File: classification.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def create_output(self, scoring_name, result):
    """ Wrap a scoring result into a single-row dataframe.

    Args:
        scoring_name (str): scoring function name
        result (float, dict or array): output of sklearn scoring function

    Returns:
        output_df (pd.DataFrame): one row holding ``result``; the columns are
            ``self.params['labels']`` when set, otherwise ``[scoring_name]``
    """
    labels = self.params.get('labels', None)
    # labels (when present) is the union of predicted & actual classes
    # (e.g. average=None, confusion matrix); otherwise fall back to the scoring name.
    column_names = [scoring_name] if labels is None else labels
    return pd.DataFrame(data=[result], columns=column_names)
Example #8
Source File: solar_equations.py From CityEnergyAnalyst with MIT License | 6 votes |
def calc_worst_hour(latitude, weather_data, solar_window_solstice):
    """
    Find the first hour of the solar window on the winter solstice, used for panel spacing.
    http://www.affordable-solar.com/learning-center/building-a-system/calculating-tilted-array-spacing/

    Sites with a positive latitude use 21 December; all other sites (including
    latitude 0) use 21 June.

    :param latitude: latitude of the site [degree]
    :type latitude: float
    :param weather_data: weather data of the site; must provide 'month', 'day' and 'hour' columns
    :type weather_data: pd.DataFrame
    :param solar_window_solstice: the desired hours of shade-free solar window on the winter solstice.
    :type solar_window_solstice: float
    :return worst_hour: index label of the first matching row of weather_data
    :rtype worst_hour: float
    """
    if latitude > 0:
        solstice_filter = 'month == 12 & day == 21'
    else:
        solstice_filter = 'month == 6 & day == 21'
    solstice_day = weather_data.query(solstice_filter)

    # The window is centred on noon, so it opens half its length before 12.
    window_start = 12 - round(solar_window_solstice / 2)
    matching_rows = solstice_day[solstice_day.hour == window_start]
    return matching_rows.index[0]
Example #9
Source File: classification.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def create_output(self, scoring_name, result):
    """ Build the output dataframe for precision_recall_fscore_support.

    Differs from the parent implementation because the output shape depends
    on the averaging mode:
      - no labels in ``self.params`` (average != None): a single row with the
        columns precision / recall / fbeta_score / support;
      - labels present (average=None): four rows, one per metric, with a
        leading 'metric' column and one 'scored(<label>)' column per class.
    """
    metric_names = ['precision', 'recall', 'fbeta_score', 'support']

    # Labels is populated when average=None; metrics are then computed per target class.
    labels = self.params.get('labels', None)
    if labels is None:
        single_row = np.array(result).reshape(1, -1)  # 1 x 4
        return pd.DataFrame(data=single_row, columns=metric_names)

    per_class = np.vstack(result)  # one row per metric, one column per class
    metric_column = np.array(metric_names).reshape(-1, 1)
    table = np.hstack((metric_column, per_class))
    # 'scored(...)' naming keeps the columns grouped under alphabetical sorting
    header = ['metric'] + ['scored({})'.format(i) for i in labels]
    return pd.DataFrame(data=table, columns=header)
Example #10
Source File: trade_api.py From TradeSim with Apache License 2.0 | 6 votes |
def query_portfolio(self, format=""):
    """Query the portfolio through the remote ``pms.query_portfolio`` call.

    Args:
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"NetPosition"`` with
        ``"security"`` as index column (per the original docstring, a pandas
        DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    result_format = self._get_format(format, "pandas")
    # Column-oriented data is requested only when pandas output is wanted.
    request = {"format": "columnset"} if result_format == "pandas" else {}

    reply = self._remote.call("pms.query_portfolio", request)
    return utils.extract_result(
        reply,
        index_column="security",
        data_format=result_format,
        class_name="NetPosition",
    )
Example #11
Source File: trade_api.py From TradeSim with Apache License 2.0 | 6 votes |
def query_trade(self, task_id=-1, format=""):
    """Query trades through the remote ``oms.query_trade`` call.

    Args:
        task_id: Task to query; -1 means all tasks.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Trade"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"task_id": task_id}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_trade", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Trade")
Example #12
Source File: trade_api.py From TradeSim with Apache License 2.0 | 6 votes |
def query_task(self, task_id=-1, format=""):
    """Query tasks through the remote ``oms.query_task`` call.

    Args:
        task_id: Task to query; -1 means all tasks.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Task"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"task_id": task_id}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_task", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Task")
Example #13
Source File: trade_api.py From TradeSim with Apache License 2.0 | 6 votes |
def query_net_position(self, mode="all", securities="", format=""):
    """Query net positions through the remote ``oms.query_net_position`` call.

    Args:
        mode: Sent to the server as the ``"mode"`` request field.
        securities: Securities separated by ","; sent as the ``"security"``
            request field.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"NetPosition"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"mode": mode, "security": securities}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_net_position", request)
    return utils.extract_result(reply, data_format=result_format, class_name="NetPosition")
Example #14
Source File: trade_api.py From TradeSim with Apache License 2.0 | 6 votes |
def query_position(self, mode="all", securities="", format=""):
    """Query positions through the remote ``oms.query_position`` call.

    Args:
        mode: Sent to the server as the ``"mode"`` request field.
        securities: Securities separated by ","; sent as the ``"security"``
            request field.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Position"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"mode": mode, "security": securities}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_position", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Position")
Example #15
Source File: trade_api.py From TradeSim with Apache License 2.0 | 6 votes |
def query_account(self, format=""):
    """Query account information through the remote ``oms.query_account`` call.

    Args:
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Account"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    result_format = self._get_format(format, "pandas")
    # Column-oriented data is requested only when pandas output is wanted.
    request = {"format": "columnset"} if result_format == "pandas" else {}

    reply = self._remote.call("oms.query_account", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Account")
Example #16
Source File: model_datasets.py From AMPL with MIT License | 6 votes |
def load_featurized_data(self):
    """Load prefeaturized data, preferring the full dataset when it is already featurized.

    The full dataset is loaded first; if it already contains every required
    feature column it is returned as is. Otherwise a CSV is read from the
    featurized-data subdirectory located next to ``self.params.dataset_key``.
    In both cases ``self.dataset_key`` is set to the location the data came from.
    The result is meant to be passed to featurization.extract_prefeaturized_data()
    for processing.

    Returns:
        featurized_dset_df (pd.DataFrame): dataframe of the prefeaturized data,
            needs further processing
    """
    full_df = self.load_full_dataset()
    if self.has_all_feature_columns(full_df):
        # Nothing else to load: the raw dataset already carries the features.
        self.dataset_key = self.params.dataset_key
        return full_df

    # Otherwise derive the expected location of the featurized dataset file.
    file_name = self.featurization.get_featurized_dset_name(self.dataset_name)
    parent_dir = os.path.dirname(self.params.dataset_key)
    featurized_path = os.path.join(
        parent_dir,
        self.featurization.get_featurized_data_subdir(),
        file_name,
    )

    featurized_df = pd.read_csv(featurized_path)
    self.dataset_key = featurized_path
    return featurized_df

# ****************************************************************************************
Example #17
Source File: trade_api.py From TradeApi with Apache License 2.0 | 6 votes |
def query_position(self, mode="all", securities="", format=""):
    """Query positions through the remote ``oms.query_position`` call.

    Args:
        mode: Sent to the server as the ``"mode"`` request field.
        securities: Securities separated by ","; sent as the ``"security"``
            request field.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Position"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"mode": mode, "security": securities}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_position", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Position")
Example #18
Source File: trade_api.py From TradeApi with Apache License 2.0 | 6 votes |
def query_net_position(self, mode="all", securities="", format=""):
    """Query net positions through the remote ``oms.query_net_position`` call.

    Args:
        mode: Sent to the server as the ``"mode"`` request field.
        securities: Securities separated by ","; sent as the ``"security"``
            request field.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"NetPosition"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"mode": mode, "security": securities}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_net_position", request)
    return utils.extract_result(reply, data_format=result_format, class_name="NetPosition")
Example #19
Source File: model_datasets.py From AMPL with MIT License | 6 votes |
def has_all_feature_columns(self, dset_df):
    """Check whether dset_df already holds every feature column the featurization needs.

    The required columns are obtained from ``self.featurization.get_feature_columns()``
    and compared against the column names of dset_df.

    Args:
        dset_df (DataFrame): Feature matrix

    Returns:
        (Boolean): True when no required column is missing from dset_df,
            False otherwise
    """
    required = set(self.featurization.get_feature_columns())
    available = set(dset_df.columns.values)
    return required <= available

# *************************************************************************************
Example #20
Source File: trade_api.py From TradeApi with Apache License 2.0 | 6 votes |
def query_task(self, task_id=-1, format=""):
    """Query tasks through the remote ``oms.query_task`` call.

    Args:
        task_id: Task to query; -1 means all tasks.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Task"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"task_id": task_id}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_task", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Task")
Example #21
Source File: trade_api.py From TradeApi with Apache License 2.0 | 6 votes |
def query_trade(self, task_id=-1, format=""):
    """Query trades through the remote ``oms.query_trade`` call.

    Args:
        task_id: Task to query; -1 means all tasks.
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"Trade"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    request = {"task_id": task_id}
    result_format = self._get_format(format, "pandas")
    if result_format == "pandas":
        # Column-oriented data is requested only when pandas output is wanted.
        request["format"] = "columnset"

    reply = self._remote.call("oms.query_trade", request)
    return utils.extract_result(reply, data_format=result_format, class_name="Trade")
Example #22
Source File: trade_api.py From TradeApi with Apache License 2.0 | 6 votes |
def query_portfolio(self, format=""):
    """Query the portfolio through the remote ``pms.query_portfolio`` call.

    Args:
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"NetPosition"`` with
        ``"security"`` as index column (per the original docstring, a pandas
        DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    result_format = self._get_format(format, "pandas")
    # Column-oriented data is requested only when pandas output is wanted.
    request = {"format": "columnset"} if result_format == "pandas" else {}

    reply = self._remote.call("pms.query_portfolio", request)
    return utils.extract_result(
        reply,
        index_column="security",
        data_format=result_format,
        class_name="NetPosition",
    )
Example #23
Source File: bivariate.py From btgym with GNU Lesser General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """
    Initialise the instance by delegating to the parent constructor with
    `_train_class_ref` fixed to `BivariateStateSetGenerator`.

    The keyword arguments below are not read here; they are forwarded
    unchanged to the parent class (descriptions carried over from the
    original documentation).

    Args:
        model_params:           dict holding generative model parameters,
                                same as args for: bivariate_state_set_iterator_fn
        assets_filenames:       dict. of two keys in form of {'asset_name': 'data_file_name'},
                                test data or None, ignored when `assets_dataframes` arg. is given
        assets_dataframes:      dict. of two keys in form of {'asset_name': pd.DataFrame},
                                an alternative way to provide test data as pandas.DataFrame instances,
                                overrides `assets_filenames`
        train_episode_duration: dict of keys {'days', 'hours', 'minutes'} - train sample duration
        test_episode_duration:  dict of keys {'days', 'hours', 'minutes'} - test sample duration
    """
    # Everything is passed through as given; only the training class reference is pinned.
    super().__init__(*args, _train_class_ref=BivariateStateSetGenerator, **kwargs)
Example #24
Source File: classification.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def prepare_input_data(self, df, actual_field, predicted_field, options):
    """ Turn the input dataframe into label arrays ready for scoring.

    Cleans the data, makes both fields categorical, validates the scoring
    parameters against the data and, when the ``all_labels`` meta parameter
    is set, stores the union of observed classes in ``self.params['labels']``.
    Overwritten as needed.

    Args:
        df (pd.DataFrame): input dataframe
        actual_field (str): ground-truth labels field name
        predicted_field (str): predicted labels field name
        options (dict): input options; only 'mlspl_limits' is read here

    Returns:
        y_actual (np.array): preprocessed ground-truth labels
        y_predicted (np.array): preprocessed predicted labels
    """
    limits = options.get('mlspl_limits', None)

    # Remove nans and check limits, then convert to str if needed
    cleaned = prepare_classification_scoring_data(df, actual_field, predicted_field, limits)
    categorical = make_categorical(cleaned, [actual_field, predicted_field])

    # Check for inconsistencies between the parameters and the data
    self.check_params_with_data(categorical, actual_field, predicted_field)
    # Warn if actual and predicted fields share no class
    check_class_intersection(categorical, actual_field, predicted_field)

    want_all_labels = self._meta_params.get('all_labels', False)
    if want_all_labels:
        # Needed when average=None or for a confusion matrix
        self.params['labels'] = get_union_of_field_values(categorical, [actual_field, predicted_field])

    y_actual = categorical[actual_field].values
    y_predicted = categorical[predicted_field].values
    return y_actual, y_predicted
Example #25
Source File: trade_api.py From TradeApi with Apache License 2.0 | 5 votes |
def query_repo_contract(self, format=""):
    """Query repo contracts through the remote ``oms.query_repo_contract`` call.

    The request carries no parameters.

    Args:
        format: Requested result format; it is handed to ``self._get_format``
            together with ``"pandas"``.

    Returns:
        ``(None, msg)`` when the session check fails. Otherwise the value
        built by ``utils.extract_result`` for class ``"RepoContract"`` (per the
        original docstring, a pandas DataFrame).
    """
    session_ok, message = self._check_session()
    if not session_ok:
        return (None, message)

    reply = self._remote.call("oms.query_repo_contract", {})
    result_format = self._get_format(format, "pandas")
    return utils.extract_result(reply, data_format=result_format, class_name="RepoContract")
Example #26
Source File: feature_utils.py From 2020plus with Apache License 2.0 | 5 votes |
def process_mutational_features(mydf):
    """Run the feature processing pipeline on raw mutation data.

    Parameters
    ----------
    mydf : pd.DataFrame
        data frame containing the desired raw data for computation of
        features for classifier

    Returns
    -------
    proc_feat_df: pd.DataFrame
        dataframe consisting of features for classification, including the
        two missense position entropy columns
    """
    # Column names are aligned with those expected by previously written code.
    column_aliases = {'Protein_Change': 'AminoAcid',
                      'DNA_Change': 'Nucleotide'}
    mydf = mydf.rename(columns=column_aliases)

    # Build the raw feature matrix; its first row is the header.
    feature_rows = fmat.generate_feature_matrix(mydf, 2)
    header_row = feature_rows.pop(0)
    raw_features = pd.DataFrame(feature_rows, columns=header_row)
    proc_feat_df = normalize_mutational_features(raw_features, 0)

    # Add the entropy features. The mutation position entropy features
    # (pentropy.mutation_position_entropy) are intentionally not included.
    missense_entropy = pentropy.missense_position_entropy(mydf[['Gene', 'AminoAcid']])
    for entropy_column in ('missense position entropy',
                           'pct of uniform missense entropy'):
        proc_feat_df[entropy_column] = missense_entropy[entropy_column]
    return proc_feat_df
Example #27
Source File: feature_utils.py From 2020plus with Apache License 2.0 | 5 votes |
def random_sort(df, prng=None):
    """Return a randomly shuffled copy of a DataFrame.

    NOTE: if the training data is not randomly shuffled, then
    supervised learning may find artifacts related to the
    order of the data.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with feature information
    prng : np.random.RandomState, optional
        random number generator used for the shuffle; a fresh
        RandomState is created when omitted

    Returns
    -------
    df : pd.DataFrame
        Randomly shuffled data frame
    """
    generator = np.random.RandomState() if prng is None else prng

    # Draw every index label exactly once, in random order.
    shuffled_labels = generator.choice(df.index.values, len(df), replace=False)

    return df.loc[shuffled_labels].copy()
Example #28
Source File: QualityModule.py From staramr with Apache License 2.0 | 5 votes |
def _get_num_contigs_over_minimum_bp_feedback(self,files_contigs_lengths,minimum_contig_length,unacceptable_num_contigs_over_minimum_bp): """ Goes through the files and determines whether or not they pass the quality metrics for the acceptable number of contigs equal to or above the minimum contig length :param files_contigs_lengths: The lengths of the contigs for the files :param minimum_contig_length: The minimum contig length as defined by the user for quality metrics :param unacceptable_num_contigs: The number of contigs in a file, equal to or above our minimum contig length, for which to raise a flag as defined by the user for quality metrics :return: An array where the first element is itself an array where each element is the number of contigs equal to or above the minimum contig length for the corresponding file. The second element is itself an array where each element is the feedback (True or False) for whether the corresponding file passes the acceptable number of contigs equal to or above the minimum contig length quality metric """ #This array is what we will return and will contain for each of the files the number of contigs of length greater than or equal to the minimum contig length #as well as the feedback for whether or not this number of contigs is greater than or equal to the unacceptable number of contigs and thus wether it passes or fails the quality metric #this array will be used to construct our quality module dataframe feedback=[] #This array will contain the number of contigs of length greater than or equal to the minimum contig length for the files, it will be returned as the first element of feedback file_num_contigs=[] #This array will contain the feedback of either True or False for whether or not the corresponding files pass the quality metrics for the acceptable number of contigs equal to #or above the minimum contig length, it will be returned as the second element of feedback contigs_over_minimum_bp_feedback=[] for 
file_contigs_lengths in files_contigs_lengths: num_contigs = 0 for contig in file_contigs_lengths: if contig >= minimum_contig_length: num_contigs = num_contigs+1 file_num_contigs.append(num_contigs) for file_num_contigs_over_minimum_bp in file_num_contigs: if file_num_contigs_over_minimum_bp >= unacceptable_num_contigs_over_minimum_bp: contigs_over_minimum_bp_feedback.append(False) else: contigs_over_minimum_bp_feedback.append(True) feedback.append(file_num_contigs) feedback.append(contigs_over_minimum_bp_feedback) return feedback
Example #29
Source File: QualityModule.py From staramr with Apache License 2.0 | 5 votes |
def _get_genome_length_feedback(self,files_genome_lengths,lb_gsize,ub_gsize): """ Goes through the files and determines whether or not they pass the quality metrics for genome length :param files_genome_lengths: An array where each element is the genome length of the corresponding file :param lb_gsize: The lower bound for the genome size as defined by the user for quality metrics :param ub_gsize: The upper bound for the genome size as defined by the user for quality metrics :return: An array where each element corresponds to the feedback (true or false) for the corresponding file in regards to the genome size quality metric """ #The array contains the feedback of either True or false for whether or not the corresponding files pass the genome length quality metric, and #this feedback will be used to construc our quality module dataframe files_genome_feedback=[genome_length >= lb_gsize and genome_length <= ub_gsize for genome_length in files_genome_lengths] return files_genome_feedback
Example #30
Source File: QualityModule.py From staramr with Apache License 2.0 | 5 votes |
def create_quality_module_dataframe(self):
    """
    Builds the quality module dataframe for the files in self._files. For every file it holds the genome length, the N50 value and the number of contigs
    greater or equal to the minimum contig length as specified by the quality metrics, followed by the result of the quality metrics and the feedback
    explaining a failure
    :return: A pd.DataFrame indexed by 'Isolate ID' (file name without directory and extension) containing the genome size, N50 value, number of contigs
             equal to or above our user defined minimum contig length as well as the results of our quality metrics and the corresponding feedback
    """
    isolate_ids=[path.splitext(path.basename(file_path))[0] for file_path in self._files]

    #Element 0 holds the contig lengths of the files, element 1 their genome lengths
    lengths=self._get_files_contigs_and_genomes_lengths(self._files)
    contig_lengths=lengths[0]
    genome_lengths=lengths[1]

    genome_feedback=self._get_genome_length_feedback(genome_lengths,self._genome_size_lower_bound,self._genome_size_upper_bound)
    n50_info=self._get_N50_feedback(contig_lengths,genome_lengths,self._minimum_N50_value)
    contig_count_info=self._get_num_contigs_over_minimum_bp_feedback(contig_lengths,self._minimum_contig_length,self._unacceptable_num_contigs)

    #Element 0 holds the detailed feedback, element 1 the overall result
    quality_module=self._get_quality_module(genome_feedback,n50_info[1],contig_count_info[1])
    detailed_feedback=quality_module[0]
    overall_result=quality_module[1]

    #Quality metric values, indexed by file name so they can be merged with the feedback below
    contig_column='Number of Contigs Greater Than Or Equal To '+str(self._minimum_contig_length)+' bp'
    metric_rows=[list(row) for row in zip(isolate_ids,genome_lengths,n50_info[0],contig_count_info[0])]
    metrics_frame=pd.DataFrame(metric_rows,columns=('Isolate ID','Genome Length','N50 value',contig_column))
    metrics_frame=metrics_frame.set_index('Isolate ID')

    #Feedback for the quality metrics, indexed by file name as well
    feedback_rows=[list(row) for row in zip(isolate_ids,overall_result,detailed_feedback)]
    feedback_frame=pd.DataFrame(feedback_rows,columns=('Isolate ID','Quality Module','Quality Module Feedback'))
    feedback_frame=feedback_frame.set_index('Isolate ID')

    #Quality metric values together with their corresponding feedback
    return metrics_frame.merge(feedback_frame, on='Isolate ID', how='left')