Python config.LOG_DIR Examples
The following are 30 code examples of config.LOG_DIR (a module-level attribute, not a callable).
You can go to the original project or source file by following the link above each example.
You may also want to check out all available functions and classes of the module config.
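Nearly every example below follows the same idiom: build a timestamped log-file name, then pass config.LOG_DIR together with that name to logging_utils._get_logger. The sketch below is a minimal, self-contained reconstruction of that pattern for orientation only: the body of _get_logger and the LOG_DIR value are assumptions (each project ships its own config and logging_utils modules); only the call signature _get_logger(config.LOG_DIR, logname) and the format string (mirroring the basicConfig line commented out in Example #3) are taken from the examples.

import logging
import os

# Stand-in for the projects' config.py; the actual value varies per project.
LOG_DIR = "./log"

def _get_logger(log_dir, logname):
    """Create a logger that writes to <log_dir>/<logname> (assumed behavior)."""
    os.makedirs(log_dir, exist_ok=True)
    logger = logging.getLogger(logname)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(os.path.join(log_dir, logname))
    # Format mirrors the basicConfig line commented out in Example #3.
    handler.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s"))
    logger.addHandler(handler)
    return logger

if __name__ == "__main__":
    # Usage mirroring the examples: logger = logging_utils._get_logger(config.LOG_DIR, logname)
    logger = _get_logger(LOG_DIR, "demo.log")
    logger.info("writing to %s", os.path.join(LOG_DIR, "demo.log"))

Because LOG_DIR is consumed only as a path argument, changing the log destination per environment is a one-line config change.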
Example #1
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
Example #2
Source File: test.py From PINTO_model_zoo with MIT License | 6 votes |
def __init__(self, test_weight):
    log_dir = os.path.join(cfg.LOG_DIR, 'test')
    test_weight_path = os.path.join(cfg.WEIGHTS_DIR, test_weight)
    with tf.name_scope('input'):
        input_data = tf.placeholder(dtype=tf.float32, name='input_data')
        training = tf.placeholder(dtype=tf.bool, name='training')
    _, _, _, pred_sbbox, pred_mbbox, pred_lbbox = YOLOV3(training).build_nework(input_data)
    with tf.name_scope('summary'):
        tf.summary.FileWriter(log_dir).add_graph(tf.get_default_graph())
    self.__sess = tf.Session()
    net_vars = tf.get_collection('YoloV3')
    saver = tf.train.Saver(net_vars)
    saver.restore(self.__sess, test_weight_path)
    super(Yolo_test, self).__init__(self.__sess, input_data, training, pred_sbbox, pred_mbbox, pred_lbbox)
    print("input_data.name=", input_data.name)
    print("pred_sbbox=", pred_sbbox.name)
    print("pred_mbbox=", pred_mbbox.name)
    print("pred_lbbox=", pred_lbbox.name)
Example #3
Source File: eval.py From NFETC with MIT License | 6 votes |
def main(options):
    if options.epoch:
        time_str = datetime.datetime.now().isoformat()
        logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (options.model_name, options.data_name, time_str)
        logger = logging_utils._get_logger(config.LOG_DIR, logname)
    else:
        time_str = datetime.datetime.now().isoformat()
        logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (options.model_name, options.data_name, time_str)
        logger = logging_utils._get_logger(config.LOG_DIR, logname)
    # logger = logging.getLogger()
    # logging.basicConfig(format='[%(asctime)s] %(levelname)s: %(message)s', level=logging.INFO)
    params_dict = param_space_dict[options.model_name]
    task = Task(options.model_name, options.data_name, options.runs, params_dict, logger)
    if options.save:
        task.save()
    else:
        if options.epoch:
            task.refit()
        else:
            task.evaluate(options.full)
Example #4
Source File: feature_distance.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
Example #5
Source File: feature_distance.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #6
Source File: feature_combiner.py From kaggle-HomeDepot with MIT License | 6 votes |
def __init__(self, feature_list, feature_name, feature_suffix=".csv", feature_level=2, meta_feature_dict={}, corr_threshold=0):
    self.feature_name = feature_name
    self.feature_list = feature_list
    self.feature_suffix = feature_suffix
    self.feature_level = feature_level
    # for meta features
    self.meta_feature_dict = meta_feature_dict
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.feature_names = []
    self.has_basic = 1 if self.meta_feature_dict else 0
    logname = "feature_combiner_%s_%s.log"%(feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    if self.feature_level == 2:
        self.splitter = splitter_level2
    elif self.feature_level == 3:
        self.splitter = splitter_level3
    self.n_iter = n_iter
    self.splitter_prev = [0]*self.n_iter
Example #7
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_lsa_ngram_cosinesim():
    logname = "generate_feature_lsa_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_CosineSim, LSA_Char_Ngram_CosineSim]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator,ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #8
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_lsa_ngram_pair():
    logname = "generate_feature_lsa_ngram_pair_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

# memory error (use feature_tsne.R instead)
Example #9
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description"][:1] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
Example #10
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def run_lsa_ngram():
    logname = "generate_feature_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram, LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[3], [4]]
    # obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    obs_fields = ["search_term", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()
Example #11
Source File: extreme_ensemble_selection.py From kaggle-HomeDepot with MIT License | 6 votes |
def main(options):
    # create sub folder
    subm_folder = "%s/ensemble_selection"%config.SUBM_DIR
    os_utils._create_dirs( [subm_folder] )
    subm_prefix = "%s/test.pred.[%s]" % (subm_folder, options.outfile)
    # get model list
    log_folder = "%s/level%d_models"%(config.LOG_DIR, options.level-1)
    model_list = get_model_list(log_folder, options.size)
    # get instance splitter
    if options.level not in [2, 3]:
        inst_splitter = None
    elif options.level == 2:
        inst_splitter = splitter_level2
    elif options.level == 3:
        inst_splitter = splitter_level3
    ees = ExtremeEnsembleSelection(
        model_folder=config.OUTPUT_DIR,
        model_list=model_list,
        subm_prefix=subm_prefix,
        weight_opt_max_evals=options.weight_opt_max_evals,
        w_min=-1.,
        w_max=1.,
        inst_subsample=options.inst_subsample,
        inst_subsample_replacement=options.inst_subsample_replacement,
        inst_splitter=inst_splitter,
        model_subsample=options.model_subsample,
        model_subsample_replacement=options.model_subsample_replacement,
        bagging_size=options.bagging_size,
        init_top_k=options.init_top_k,
        epsilon=options.epsilon,
        multiprocessing=False,
        multiprocessing_num_cores=config.NUM_CORES,
        enable_extreme=options.enable_extreme,
        random_seed=config.RANDOM_SEED
    )
    ees.go()
Example #12
Source File: feature_wordnet_similarity.py From kaggle-HomeDepot with MIT License | 6 votes |
def main(): logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission generators = [ WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity, ][:1] obs_fields_list = [] target_fields_list = [] # only search_term and product_title are used in final submission obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) target_fields_list.append( ["product_title", "product_description", "product_attribute"][:1] ) # double aggregation aggregation_mode_prev = ["mean", "max", "min", "median"] aggregation_mode = ["mean", "std", "max", "min", "median"] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [aggregation_mode_prev, aggregation_mode] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
Example #13
Source File: feature_group_relevance.py From kaggle-HomeDepot with MIT License | 5 votes |
def main(): logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain2, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) obs_fields = ["search_term", "product_title"][1:] aggregation_mode = ["mean", "std", "max", "min", "median", "size"] param_list = [dfAll["id"], dfTrain, aggregation_mode] sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger) sf.go()
Example #14
Source File: task.py From NFETC with MIT License | 5 votes |
def main(options):
    time_str = datetime.datetime.now().isoformat()
    logname = "[Model@%s]_[Data@%s]_%s.log" % (options.model_name, options.data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.model_name, options.data_name, options.cv_runs, options.max_evals, logger)
    optimizer.run()
Example #15
Source File: get_stacking_feature_conf.py From kaggle-HomeDepot with MIT License | 5 votes |
def _create_feature_conf(level, topN, outfile):
    log_folder = "%s/level%d_models"%(config.LOG_DIR, level)
    feature_list = get_model_list(log_folder, topN)
    res = header_pattern%(__file__, level, int(topN), outfile)
    for feature in feature_list:
        res += '"%s",\n'%feature
    res += "]\n"
    with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f:
        f.write(res)
Example #16
Source File: task.py From HRERE with MIT License | 5 votes |
def main(options):
    time_str = datetime.datetime.now().isoformat()
    logname = "[Model@%s]_%s.log" % (options.model_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.model_name, options.max_evals, options.runs, logger)
    optimizer.run()
Example #17
Source File: eval.py From HRERE with MIT License | 5 votes |
def main(options):
    time_str = datetime.datetime.now().isoformat()
    logname = "Final_[Model@%s]_%s.log" % (options.model_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    params_dict = param_space_dict[options.model_name]
    task = Task(options.model_name, options.runs, params_dict, logger)
    task.evaluate(options.prefix)
Example #18
Source File: plot_feature_corr.py From kaggle-HomeDepot with MIT License | 5 votes |
def grap_feat_corr_dict(fname):
    d = {}
    with open("%s/feature/%s"%(config.LOG_DIR, fname), "r") as f:
        for line in f:
            corr = grap_feat_line_corr(line)
            if corr is not None:
                name = grap_feat_line_name(line)
                d[name] = corr
    return d.values()
Example #19
Source File: feature_match.py From kaggle-HomeDepot with MIT License | 5 votes |
def main(): logname = "generate_feature_match_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ MatchQueryCount, MatchQueryRatio, LongestMatchSize, LongestMatchRatio, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go() # product_attribute_list generators = [ MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] ) target_fields_list.append( ["product_attribute_list"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
Example #20
Source File: feature_first_last_ngram.py From kaggle-HomeDepot with MIT License | 5 votes |
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #21
Source File: task.py From kaggle-HomeDepot with MIT License | 5 votes |
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name, options.feature_name,
                              logger, options.max_evals, verbose=True,
                              refit_once=options.refit_once, plot_importance=options.plot_importance)
    optimizer.run()
Example #22
Source File: feature_group_distance.py From kaggle-HomeDepot with MIT License | 5 votes |
def main(): logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) dfTrain = dfAll.iloc[:TRAIN_SIZE].copy() ## run python3 splitter.py first split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR) n_iter = len(split) relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3] relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3] ngrams = [1] obs_fields = ["search_term"] target_fields = ["product_title", "product_description"] aggregation_mode = ["mean", "std", "max", "min", "median"] ## for cv for i in range(n_iter): trainInd, validInd = split[i][0], split[i][1] dfTrain2 = dfTrain.iloc[trainInd].copy() sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go() ## for all sub_feature_dir = "%s/All" % (config.FEAT_DIR) for target_field in target_fields: for relevance in relevances: for ngram in ngrams: param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode] pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger) pf.go()
Example #23
Source File: feature_combiner.py From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, feature_dict, feature_name, feature_suffix=".pkl", corr_threshold=0):
    self.feature_name = feature_name
    self.feature_dict = feature_dict
    self.feature_suffix = feature_suffix
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.feature_names = []
    self.basic_only = 0
    logname = "feature_combiner_%s_%s.log"%(feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.splitter = splitter_level1
    self.n_iter = n_iter
Example #24
Source File: extreme_ensemble_selection.py From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, model_folder, model_list, subm_prefix,
             weight_opt_max_evals=10, w_min=-1., w_max=1.,
             inst_subsample=0.5, inst_subsample_replacement=False,
             inst_splitter=None,
             model_subsample=1.0, model_subsample_replacement=True,
             bagging_size=10, init_top_k=5, epsilon=0.00001,
             multiprocessing=False, multiprocessing_num_cores=1,
             enable_extreme=True, random_seed=0):
    self.model_folder = model_folder
    self.model_list = model_list
    self.subm_prefix = subm_prefix
    self.weight_opt_max_evals = weight_opt_max_evals
    self.w_min = w_min
    self.w_max = w_max
    assert inst_subsample > 0 and inst_subsample <= 1.
    self.inst_subsample = inst_subsample
    self.inst_subsample_replacement = inst_subsample_replacement
    self.inst_splitter = inst_splitter
    assert model_subsample > 0
    assert (type(model_subsample) == int) or (model_subsample <= 1.)
    self.model_subsample = model_subsample
    self.model_subsample_replacement = model_subsample_replacement
    self.bagging_size = bagging_size
    self.init_top_k = init_top_k
    self.epsilon = epsilon
    self.multiprocessing = multiprocessing
    self.multiprocessing_num_cores = multiprocessing_num_cores
    self.enable_extreme = enable_extreme
    self.random_seed = random_seed
    logname = "ensemble_selection_%s.log"%time_utils._timestamp()
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.n_models = len(self.model_list)
Example #25
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 5 votes |
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[3], [4]]
    # obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    obs_fields = ["search_term", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
Example #26
Source File: feature_intersect_count.py From kaggle-HomeDepot with MIT License | 5 votes |
def main(): logname = "generate_feature_intersect_count_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) generators = [ IntersectCount_Ngram, IntersectRatio_Ngram, CooccurrenceCount_Ngram, CooccurrenceRatio_Ngram, ] obs_fields_list = [] target_fields_list = [] ## query in document obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) ## document in query obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] ) target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] ) ngrams = [1,2,3,12,123][:3] for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: for ngram in ngrams: param_list = [ngram] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()
Example #27
Source File: feature_stat_cooc_tfidf.py From kaggle-HomeDepot with MIT License | 4 votes |
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s_%s.log"%(which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    if which == "tf":
        generators.append( StatCoocTF_Ngram )
    elif which == "norm_tf":
        generators.append( StatCoocNormTF_Ngram )
    elif which == "tfidf":
        generators.append( StatCoocTFIDF_Ngram )
    elif which == "norm_tfidf":
        generators.append( StatCoocNormTFIDF_Ngram )
    elif which == "bm25":
        generators.append( StatCoocBM25_Ngram )

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example #28
Source File: feature_query_quality.py From kaggle-HomeDepot with MIT License | 4 votes |
def main(): logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) obs_corpus = [] query_suffix = [] # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("raw") # after processing dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("lemmatized") # after extracting product_name in search_term obs_corpus.append(dfAll["search_term_product_name"].values) query_suffix.append("product_name") if "search_term_auto_corrected" in dfAll.columns: # after auto correction obs_corpus.append(dfAll["search_term_auto_corrected"].values) query_suffix.append("corrected") # after stemming dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) obs_corpus.append(dfAll["search_term"].values) query_suffix.append("stemmed") y_train = dfAll["relevance"].values[:TRAIN_SIZE] for i in range(len(query_suffix)-1): for j in range(i+1, len(query_suffix)): ext = QueryQuality(obs_corpus[i], obs_corpus[j]) x = ext.transform() dim = np_utils._dim(x) fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim) pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x) corr = np_utils._corr(x[:TRAIN_SIZE], y_train) logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr)) # raw dfAll = pkl_utils._load(config.ALL_DATA_RAW) obs_fields = ["search_term"] param_list = [] sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
Example #29
Source File: feature_basic.py From kaggle-HomeDepot with MIT License | 4 votes |
def main(): logname = "generate_feature_basic_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED) ## basic generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_uid generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3] obs_fields = ["product_uid"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["search_term", "product_title", "product_description", "product_attribute", "product_brand", "product_color"] ngrams = [1,2,3] for generator in generators: for ngram in ngrams: param_list = [ngram] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go() ## for product_attribute_list generators = [ AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount, AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth, AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor, ] obs_fields = ["product_attribute_list"] for generator in generators: param_list = [] sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) sf.go()
Example #30
Source File: feature_doc2vec.py From kaggle-HomeDepot with MIT License | 4 votes |
def main(): logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp() logger = logging_utils._get_logger(config.LOG_DIR, logname) #### NOTE: use data BEFORE STEMMING dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED) doc2vec_model_dirs = [] model_prefixes = [] ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) ) model_prefixes.append( "Homedepot" ) for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes): ## load model try: if ".bin" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True) if ".txt" in doc2vec_model_dir: doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False) else: doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir) doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label") except: continue # ## standalone (not used in model building) # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"] # generator = Doc2Vec_Vector # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger) # sf.go() ## pairwise generators = [ Doc2Vec_CosineSim, Doc2Vec_RMSE, # Doc2Vec_Vdiff, ] obs_fields_list = [] target_fields_list = [] obs_fields_list.append( ["search_term", "search_term_alt"][:1] ) target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] ) for obs_fields, target_fields in zip(obs_fields_list, target_fields_list): for generator in generators: param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix] pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger) pf.go()