Python tqdm.tqdm() Examples

The following are 30 code examples of tqdm.tqdm(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tqdm, or try the search function.
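
Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the tqdm keyword arguments that recur throughout them, such as desc, total, disable, leave, and ncols:

from tqdm import tqdm
import time

def process(items, verbose=True):
    # Wrap any iterable; desc labels the bar, disable switches it off,
    # leave=False removes the bar when finished, ncols fixes its width.
    for item in tqdm(items, desc='Processing', total=len(items),
                     disable=not verbose, leave=False, ncols=80):
        time.sleep(0.01)  # stand-in for real work

process(range(100))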
Example #1
Source File: utils.py From comet-commonsense with Apache License 2.0 | 8 votes |
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example #2
Source File: datasets.py From pruning_yolov3 with GNU General Public License v3.0 | 8 votes |
def convert_images2bmp():
    # cv2.imread() jpg at 230 img/s, *.bmp at 400 img/s
    for path in ['../coco/images/val2014/', '../coco/images/train2014/']:
        folder = os.sep + Path(path).name
        output = path.replace(folder, folder + 'bmp')
        if os.path.exists(output):
            shutil.rmtree(output)  # delete output folder
        os.makedirs(output)  # make new output folder

        for f in tqdm(glob.glob('%s*.jpg' % path)):
            save_name = f.replace('.jpg', '.bmp').replace(folder, folder + 'bmp')
            cv2.imwrite(save_name, cv2.imread(f))

    for label_path in ['../coco/trainvalno5k.txt', '../coco/5k.txt']:
        with open(label_path, 'r') as file:
            lines = file.read()
        lines = lines.replace('2014/', '2014bmp/').replace('.jpg', '.bmp').replace(
            '/Users/glennjocher/PycharmProjects/', '../')
        with open(label_path.replace('5k', '5k_bmp'), 'w') as file:
            file.write(lines)
Example #3
Source File: generate.py From post--memorization-in-rnns with MIT License | 7 votes |
def save_tfrecord(filename, dataset, verbose=False):
    observations = len(dataset['length'])

    serialized = []
    with Pool(processes=4) as pool:
        for serialized_string in tqdm(pool.imap(
            tfrecord_serializer,
            zip(dataset['length'], dataset['source'], dataset['target']),
            chunksize=10
        ), total=observations, disable=not verbose):
            serialized.append(serialized_string)

    # Save serialized dataset
    writer = tf.python_io.TFRecordWriter(
        filename,
        options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB
        )
    )

    for serialized_string in tqdm(serialized, disable=not verbose):
        writer.write(serialized_string)

    writer.close()
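
Example #3 wraps pool.imap in tqdm. Because imap returns a lazy iterator with no length, the bar can only report percentages if total= is passed explicitly, as the example does with total=observations. A minimal standalone sketch of the same idiom (the square function is just a placeholder workload):

from multiprocessing import Pool
from tqdm import tqdm

def square(x):
    return x * x

if __name__ == '__main__':
    items = list(range(10000))
    results = []
    with Pool(processes=4) as pool:
        # imap has no __len__, so pass total= explicitly.
        for value in tqdm(pool.imap(square, items, chunksize=100), total=len(items)):
            results.append(value)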
Example #4
Source File: dataset.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def load_embedding(self, f, reset=[]):
    vectors = {}
    for line in tqdm(f.readlines(), desc='Loading embeddings'):
        tokens = line.rstrip('\n').split(' ')
        word = tokens[0].lower() if self.lower else tokens[0]
        if self.include_unseen:
            self.add(word)
        if word in self.tok2idx:
            vectors[word] = [float(x) for x in tokens[1:]]
    dim = len(vectors.values()[0])

    def to_vector(tok):
        if tok in vectors and tok not in reset:
            return vectors[tok]
        elif tok not in vectors:
            return np.random.normal(-0.05, 0.05, size=dim)
        else:
            return [0.0]*dim

    self.embed = mx.nd.array([vectors[tok] if tok in vectors and tok not in reset
                              else [0.0]*dim for tok in self.idx2tok])
Example #5
Source File: dqn.py From Pytorch-Project-Template with MIT License | 6 votes |
def train(self):
    """
    Training loop based on the number of episodes
    :return:
    """
    for episode in tqdm(range(self.current_episode, self.config.num_episodes)):
        self.current_episode = episode
        # reset environment
        self.env.reset()
        self.train_one_epoch()
        # The target network has its weights kept frozen most of the time
        if self.current_episode % self.config.target_update == 0:
            self.target_model.load_state_dict(self.policy_model.state_dict())

    self.env.render()
    self.env.close()
Example #6
Source File: Embed.py From pytorch_NER_BiLSTM_CNN_CRF with Apache License 2.0 | 6 votes |
def _read_file(path):
    """
    :param path: embed file path
    :return:
    """
    embed_dict = {}
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
        lines = tqdm.tqdm(lines)
        for line in lines:
            values = line.strip().split(' ')
            if len(values) == 1 or len(values) == 2 or len(values) == 3:
                continue
            w, v = values[0], values[1:]
            embed_dict[w] = v
    return embed_dict
Example #7
Source File: pregenerate_training_data.py From tpu_pretrain with Apache License 2.0 | 6 votes |
def input_file_to_training_data(args, input_file, epoch, tokenizer, num_files):
    print(input_file)
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with open(input_file) as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added

        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        for i in range(args.epochs_to_generate):
            create_training_file(docs, tokenizer, args, epoch + i * num_files)
Example #8
Source File: utils.py From pruning_yolov3 with GNU General Public License v3.0 | 6 votes |
def coco_single_class_labels(path='../coco/labels/train2014/', label_class=43):
    # Makes single-class coco datasets. from utils.utils import *; coco_single_class_labels()
    if os.path.exists('new/'):
        shutil.rmtree('new/')  # delete output folder
    os.makedirs('new/')  # make new output folder
    os.makedirs('new/labels/')
    os.makedirs('new/images/')
    for file in tqdm(sorted(glob.glob('%s/*.*' % path))):
        with open(file, 'r') as f:
            labels = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
        i = labels[:, 0] == label_class
        if any(i):
            img_file = file.replace('labels', 'images').replace('txt', 'jpg')
            labels[:, 0] = 0  # reset class to 0
            with open('new/images.txt', 'a') as f:  # add image to dataset list
                f.write(img_file + '\n')
            with open('new/labels/' + Path(file).name, 'a') as f:  # write label
                for l in labels[i]:
                    f.write('%g %.6f %.6f %.6f %.6f\n' % tuple(l))
            shutil.copyfile(src=img_file, dst='new/images/' + Path(file).name.replace('txt', 'jpg'))  # copy images
Example #9
Source File: preprocessing.py From Image-Caption-Generator with MIT License | 6 votes |
def extract_features(path, model_type):
    if model_type == 'inceptionv3':
        from keras.applications.inception_v3 import preprocess_input
        target_size = (299, 299)
    elif model_type == 'vgg16':
        from keras.applications.vgg16 import preprocess_input
        target_size = (224, 224)
    # Get CNN Model from model.py
    model = CNNModel(model_type)
    features = dict()
    # Extract features from each photo
    for name in tqdm(os.listdir(path)):
        # Loading and resizing image
        filename = path + name
        image = load_img(filename, target_size=target_size)
        # Convert the image pixels to a numpy array
        image = img_to_array(image)
        # Reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # Prepare the image for the CNN model
        image = preprocess_input(image)
        # Pass image into model to get encoded features
        feature = model.predict(image, verbose=0)
        # Store encoded features for the image
        image_id = name.split('.')[0]
        features[image_id] = feature
    return features
Example #10
Source File: utils.py From pruning_yolov3 with GNU General Public License v3.0 | 6 votes |
def crop_images_random(path='../images/', scale=0.50):  # from utils.utils import *; crop_images_random()
    # crops images into random squares up to scale fraction
    # WARNING: overwrites images!
    for file in tqdm(sorted(glob.glob('%s/*.*' % path))):
        img = cv2.imread(file)  # BGR
        if img is not None:
            h, w = img.shape[:2]

            # create random mask
            a = 30  # minimum size (pixels)
            mask_h = random.randint(a, int(max(a, h * scale)))  # mask height
            mask_w = mask_h  # mask width

            # box
            xmin = max(0, random.randint(0, w) - mask_w // 2)
            ymin = max(0, random.randint(0, h) - mask_h // 2)
            xmax = min(w, xmin + mask_w)
            ymax = min(h, ymin + mask_h)

            # apply random color mask
            cv2.imwrite(file, img[ymin:ymax, xmin:xmax])
Example #11
Source File: gla_gpu.py From Deep_VoiceChanger with MIT License | 6 votes |
def auto_inverse(self, whole_spectrum):
    whole_spectrum = np.copy(whole_spectrum).astype(complex)
    whole_spectrum[whole_spectrum < 1] = 1
    overwrap = self.buffer_size * 2
    height = whole_spectrum.shape[0]
    parallel_dif = (height-overwrap) // self.parallel
    if height < self.parallel*overwrap:
        raise Exception('voice length is too small to use gpu, or parallel number is too big')

    spec = [self.inverse(whole_spectrum[range(i, i+parallel_dif*self.parallel, parallel_dif), :])
            for i in tqdm.tqdm(range(parallel_dif+overwrap))]
    spec = spec[overwrap:]
    spec = np.concatenate(spec, axis=1)
    spec = spec.reshape(-1, self.wave_len)

    # Below code doesn't consider wave_len and wave_dif; to be fixed.
    wave = np.fft.ifft(spec, axis=1).real
    pad = np.zeros((wave.shape[0], 2), dtype=float)
    wave = np.concatenate([wave, pad], axis=1)

    dst = np.zeros((wave.shape[0]+3)*self.wave_dif, dtype=float)
    for i in range(4):
        w = wave[range(i, wave.shape[0], 4), :]
        w = w.reshape(-1)
        dst[i*self.wave_dif:i*self.wave_dif+len(w)] += w

    return dst*0.5
Example #12
Source File: test_classification_tree.py From Kaggler with MIT License | 6 votes |
def test():
    data = np.random.randint(0, 1000, size=(N_OBS, N_FEATURE))
    y = np.random.randint(2, size=N_OBS)
    train = data[0:N_OBS // 2]
    ytrain = y[0:N_OBS // 2]
    test = data[N_OBS // 2:]
    ytest = y[N_OBS // 2:]
    learner = ClassificationTree(number_of_features=N_FEATURE)
    for t, x in enumerate(tqdm(train)):
        learner.update(x, ytrain[t])
    correct_num = 0
    for t, x in enumerate(tqdm(test)):
        y_pred = learner.predict(x)
        if y_pred == ytest[t]:
            correct_num += 1
    print(correct_num)
Example #13
Source File: trainer.py From treelstm.pytorch with MIT License | 6 votes |
def test(self, dataset):
    self.model.eval()
    with torch.no_grad():
        total_loss = 0.0
        predictions = torch.zeros(len(dataset), dtype=torch.float, device='cpu')
        indices = torch.arange(1, dataset.num_classes + 1, dtype=torch.float, device='cpu')
        for idx in tqdm(range(len(dataset)), desc='Testing epoch ' + str(self.epoch) + ''):
            ltree, linput, rtree, rinput, label = dataset[idx]
            target = utils.map_label_to_target(label, dataset.num_classes)
            linput, rinput = linput.to(self.device), rinput.to(self.device)
            target = target.to(self.device)
            output = self.model(ltree, linput, rtree, rinput)
            loss = self.criterion(output, target)
            total_loss += loss.item()
            output = output.squeeze().to('cpu')
            predictions[idx] = torch.dot(indices, torch.exp(output))
    return total_loss / len(dataset), predictions
Example #14
Source File: autocomplete.py From post--memorization-in-rnns with MIT License | 6 votes |
def save_tfrecord(filename, dataset, verbose=False):
    observations = len(dataset['length'])

    serialized = []
    with Pool(processes=4) as pool:
        for serialized_string in tqdm(pool.imap(
            tfrecord_serializer,
            zip(dataset['length'], dataset['source'], dataset['target']),
            chunksize=10
        ), total=observations, disable=not verbose):
            serialized.append(serialized_string)

    # Save serialized dataset
    writer = tf.python_io.TFRecordWriter(
        filename,
        options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB
        )
    )

    for serialized_string in tqdm(serialized, disable=not verbose):
        writer.write(serialized_string)

    writer.close()
Example #15
Source File: extraction.py From git2net with GNU Affero General Public License v3.0 | 6 votes |
def _process_repo_serial(git_repo_dir, sqlite_db_file, commits, extraction_settings):
    """ Processes all commits in a given git repository in a serial manner.

    Args:
        git_repo_dir: path to the git repository that is mined
        sqlite_db_file: path (including database name) where the sqlite database will be created
        commits: list of commits that have to be processed
        extraction_settings: settings for the extraction

    Returns:
        sqlite database will be written at specified location
    """

    git_repo = pydriller.GitRepository(git_repo_dir)

    con = sqlite3.connect(sqlite_db_file)

    for commit in tqdm(commits, desc='Serial'):
        args = {'git_repo_dir': git_repo_dir, 'commit_hash': commit.hash, 'extraction_settings': extraction_settings}
        result = _process_commit(args)

        if not result['edits'].empty:
            result['edits'].to_sql('edits', con, if_exists='append', index=False)
        if not result['commit'].empty:
            result['commit'].to_sql('commits', con, if_exists='append', index=False)
Example #16
Source File: pools.py From python-pool-performance with MIT License | 6 votes |
def run_test(work_type: FunctionType, job_sets: Sequence, trials: int,
             pool_class: type, worker_count: int) -> Mapping:
    pool = pool_class(worker_count)
    if work_type == 'compute':
        test_func = pool.run_compute_test
    elif work_type == 'network':
        test_func = pool.run_network_test
    else:
        raise Exception("Invalid work type: {}".format(work_type))
    results = map(
        lambda jobs: test_func(jobs, trials, show_progress=True),
        tqdm(job_sets, desc=pool_class.__name__),
    )
    summarized_results = list(map(summarize_test, results))
    pool.destroy_pool()
    return summarized_results
Example #17
Source File: trainer.py From treelstm.pytorch with MIT License | 6 votes |
def train(self, dataset):
    self.model.train()
    self.optimizer.zero_grad()
    total_loss = 0.0
    indices = torch.randperm(len(dataset), dtype=torch.long, device='cpu')
    for idx in tqdm(range(len(dataset)), desc='Training epoch ' + str(self.epoch + 1) + ''):
        ltree, linput, rtree, rinput, label = dataset[indices[idx]]
        target = utils.map_label_to_target(label, dataset.num_classes)
        linput, rinput = linput.to(self.device), rinput.to(self.device)
        target = target.to(self.device)
        output = self.model(ltree, linput, rtree, rinput)
        loss = self.criterion(output, target)
        total_loss += loss.item()
        loss.backward()
        if idx % self.args.batchsize == 0 and idx > 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
    self.epoch += 1
    return total_loss / len(dataset)

# helper function for testing
Example #18
Source File: pointmass.py From cs294-112_hws with MIT License | 5 votes |
def create_visualization(self, dirname, density=False):
    for s in os.listdir(dirname):
        for i in tqdm(range(100)):
            self.visualize(None, i, os.path.join(dirname, s))
        self.create_gif(os.path.join(dirname, str(s)))
Example #19
Source File: evaluator.py From easy-faster-rcnn.pytorch with MIT License | 5 votes |
def evaluate(self, model: Model) -> Tuple[float, str]:
    all_image_ids, all_detection_bboxes, all_detection_classes, all_detection_probs = [], [], [], []

    with torch.no_grad():
        for _, (image_id_batch, image_batch, scale_batch, _, _) in enumerate(tqdm(self._dataloader)):
            image_batch = image_batch.cuda()
            assert image_batch.shape[0] == 1, 'do not use batch size more than 1 on evaluation'

            detection_bboxes, detection_classes, detection_probs, detection_batch_indices = \
                model.eval().forward(image_batch)

            scale_batch = scale_batch[detection_batch_indices].unsqueeze(dim=-1).expand_as(detection_bboxes).to(device=detection_bboxes.device)
            detection_bboxes = detection_bboxes / scale_batch

            kept_indices = (detection_probs > 0.05).nonzero().view(-1)
            detection_bboxes = detection_bboxes[kept_indices]
            detection_classes = detection_classes[kept_indices]
            detection_probs = detection_probs[kept_indices]
            detection_batch_indices = detection_batch_indices[kept_indices]

            all_detection_bboxes.extend(detection_bboxes.tolist())
            all_detection_classes.extend(detection_classes.tolist())
            all_detection_probs.extend(detection_probs.tolist())
            all_image_ids.extend([image_id_batch[i] for i in detection_batch_indices])

    mean_ap, detail = self._dataset.evaluate(self._path_to_results_dir, all_image_ids, all_detection_bboxes,
                                             all_detection_classes, all_detection_probs)
    return mean_ap, detail
Example #20
Source File: mnist_eager.py From dockerfiles with Apache License 2.0 | 5 votes |
def train(epochs):
    for epoch in range(epochs):
        for (batch, (images, labels)) in tqdm(enumerate(dataset)):
            train_step(images, labels)
        print('Epoch {} finished'.format(epoch))
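
A side note on Example #20: wrapping enumerate(dataset) hides the underlying length, so the bar runs without a percentage or ETA. If the dataset has a known length, either of the following variants (a generic sketch, not from the project) restores it:

from tqdm import tqdm

data = list(range(500))

# Option 1: give tqdm the length explicitly.
for i, x in tqdm(enumerate(data), total=len(data)):
    pass

# Option 2: wrap the iterable first, then enumerate.
for i, x in enumerate(tqdm(data)):
    pass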
Example #21
Source File: run_mujoco.py From lirpg with MIT License | 5 votes |
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list)/len(len_list)
    avg_ret = sum(ret_list)/len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret

# Sample one trajectory (until trajectory end)
Example #22
Source File: tqdm_download.py From post--memorization-in-rnns with MIT License | 5 votes |
def download(*args, desc=None, **kwargs):
    last_b = [0]

    def _download_hook(b=1, bsize=1, tsize=None):
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    with tqdm(unit='B', unit_scale=True, miniters=1, desc=desc) as t:
        urllib.request.urlretrieve(*args, reporthook=_download_hook, data=None, **kwargs)
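
Example #22 drives the bar manually from a urlretrieve reporthook, calling t.update() with the number of bytes received since the last callback. The same manual-update idiom, reduced to a standalone sketch with made-up chunk sizes:

from tqdm import tqdm

def consume_chunks(chunks, total_bytes):
    # Manual mode: create the bar with a known total and call update()
    # with the number of units completed since the last call.
    with tqdm(total=total_bytes, unit='B', unit_scale=True, desc='download') as t:
        for chunk in chunks:
            t.update(len(chunk))

consume_chunks([b'x' * 1024] * 64, total_bytes=64 * 1024)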
Example #23
Source File: generate.py From post--memorization-in-rnns with MIT License | 5 votes |
def make_source_target_alignment(words, char_map, max_length, verbose=False):
    space_char_code = char_map[' ']

    source = []
    source_current = []
    target = []
    target_current = []
    length = []
    length_current = 0

    for word in tqdm(words, disable=not verbose):
        if length_current + len(word) + 1 > max_length:
            # concatenate current data and move it to storage
            source.append(np.concatenate(source_current))
            target.append(np.concatenate(target_current))
            length.append(length_current)

            # prepare for new source and target
            source_current = []
            target_current = []
            length_current = 0

        # add source and target, while maintaining the current total length
        source_current.append(
            np.array([space_char_code] + [char_map[char] for char in word], dtype='int32')
        )
        target_current.append(
            np.array([char_map[char] for char in word] + [space_char_code], dtype='int32')
        )
        length_current += 1 + len(word)

    # concatenate remaining data and move it to storage
    if length_current > 0:
        source.append(np.concatenate(source_current))
        target.append(np.concatenate(target_current))
        length.append(length_current)

    return (length, source, target)
Example #24
Source File: behavior_clone.py From lirpg with MIT License | 5 votes |
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None, verbose=False):
    val_per_iter = int(max_iters/10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac-pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
Example #25
Source File: convert_lmdb.py From torch-toolbox with BSD 3-Clause "New" or "Revised" License | 5 votes |
def generate_lmdb_dataset(data_set: Dataset, save_dir: str, name: str,
                          num_workers=0, max_size_rate=1.0, write_frequency=5000):
    data_loader = DataLoader(data_set, num_workers=num_workers, collate_fn=lambda x: x)
    num_samples = len(data_set)
    check_dir(save_dir)
    lmdb_path = os.path.join(save_dir, '{}.lmdb'.format(name))
    db = lmdb.open(lmdb_path, subdir=False,
                   map_size=int(1099511627776 * max_size_rate),
                   readonly=False, meminit=True, map_async=True)
    txn = db.begin(write=True)
    for idx, data in enumerate(tqdm(data_loader)):
        txn.put(get_key(idx), dumps_pyarrow(data[0]))
        if idx % write_frequency == 0 and idx > 0:
            txn.commit()
            txn = db.begin(write=True)
    txn.put(b'__len__', dumps_pyarrow(num_samples))
    try:
        classes = data_set.classes
        class_to_idx = data_set.class_to_idx
        txn.put(b'classes', dumps_pyarrow(classes))
        txn.put(b'class_to_idx', dumps_pyarrow(class_to_idx))
    except AttributeError:
        pass
    txn.commit()
    db.sync()
    db.close()
Example #26
Source File: sync_checkpoint_to_s3.py From tpu_pretrain with Apache License 2.0 | 5 votes |
def sync(local_dir, keypath, bucket):
    try:
        my_bucket = s3_resource.Bucket(bucket)
        for path, subdirs, files in os.walk(local_dir):
            path = path.replace("\\", "/")
            directory_name = keypath
            for file in tqdm(files):
                my_bucket.upload_file(os.path.join(path, file), f"{keypath}/{file}")
    except Exception as err:
        print(err)
Example #27
Source File: bag_re.py From OpenNRE with MIT License | 5 votes |
def eval_model(self, eval_loader):
    self.model.eval()
    with torch.no_grad():
        t = tqdm(eval_loader)
        pred_result = []
        for iter, data in enumerate(t):
            if torch.cuda.is_available():
                for i in range(len(data)):
                    try:
                        data[i] = data[i].cuda()
                    except:
                        pass
            label = data[0]
            bag_name = data[1]
            scope = data[2]
            args = data[3:]
            logits = self.model(None, scope, *args, train=False, bag_size=self.bag_size)  # results after softmax
            logits = logits.cpu().numpy()
            for i in range(len(logits)):
                for relid in range(self.model.module.num_class):
                    if self.model.module.id2rel[relid] != 'NA':
                        pred_result.append({
                            'entpair': bag_name[i][:2],
                            'relation': self.model.module.id2rel[relid],
                            'score': logits[i][relid]
                        })
    result = eval_loader.dataset.eval(pred_result)
    return result
Example #28
Source File: sentence_re.py From OpenNRE with MIT License | 5 votes |
def eval_model(self, eval_loader):
    self.eval()
    avg_acc = AverageMeter()
    pred_result = []
    with torch.no_grad():
        t = tqdm(eval_loader)
        for iter, data in enumerate(t):
            if torch.cuda.is_available():
                for i in range(len(data)):
                    try:
                        data[i] = data[i].cuda()
                    except:
                        pass
            label = data[0]
            args = data[1:]
            logits = self.parallel_model(*args)
            score, pred = logits.max(-1)  # (B)
            # Save result
            for i in range(pred.size(0)):
                pred_result.append(pred[i].item())
            # Log
            acc = float((pred == label).long().sum()) / label.size(0)
            avg_acc.update(acc, pred.size(0))
            t.set_postfix(acc=avg_acc.avg)
    result = eval_loader.dataset.eval(pred_result)
    return result
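
Example #28 keeps a handle on the bar (t = tqdm(eval_loader)) so it can append a running accuracy with t.set_postfix(acc=...). A minimal sketch of that idiom with a dummy metric (the values are illustrative only):

from tqdm import tqdm
import random

correct, seen = 0, 0
t = tqdm(range(1000), desc='eval')
for step in t:
    correct += random.random() > 0.5  # stand-in for a real prediction check
    seen += 1
    t.set_postfix(acc=correct / seen)  # shown at the right edge of the bar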
Example #29
Source File: extraction.py From git2net with GNU Affero General Public License v3.0 | 5 votes |
def _process_repo_parallel(git_repo_dir, sqlite_db_file, commits, extraction_settings):
    """ Processes all commits in a given git repository in a parallel manner.

    Args:
        git_repo_dir: path to the git repository that is mined
        sqlite_db_file: path (including database name) where the sqlite database will be created
        commits: list of commits that are already in the database
        extraction_settings: settings for the extraction

    Returns:
        sqlite database will be written at specified location
    """
    args = [{'git_repo_dir': git_repo_dir, 'commit_hash': commit.hash, 'extraction_settings': extraction_settings}
            for commit in commits]

    # suggestion by marco-c (github.com/ishepard/pydriller/issues/110)
    def _init(git_repo_dir, git_init_lock_):
        global git_init_lock
        git_init_lock = git_init_lock_

    con = sqlite3.connect(sqlite_db_file)
    with multiprocessing.Pool(extraction_settings['no_of_processes'],
                              initializer=_init, initargs=(git_repo_dir, git_init_lock)) as p:
        with tqdm(total=len(args), desc='Parallel ({0} processes)' \
                  .format(extraction_settings['no_of_processes'])) as pbar:
            for result in p.imap_unordered(_process_commit, args,
                                           chunksize=extraction_settings['chunksize']):
                if not result['edits'].empty:
                    result['edits'].to_sql('edits', con, if_exists='append', index=False)
                if not result['commit'].empty:
                    result['commit'].to_sql('commits', con, if_exists='append', index=False)
                pbar.update(1)
Example #30
Source File: extraction.py From git2net with GNU Affero General Public License v3.0 | 5 votes |
def identify_file_renaming(git_repo_dir):
    """ Identifies all names and locations that different files in a repository have had.

    Args:
        git_repo_dir: path to the git repository that is mined

    Returns:
        dag: pathpy DAG object depicting the renaming process
        aliases: dictionary containing all aliases for all files
    """

    # TODO: Consider corner case where file is renamed and new file with old name is created.
    git_repo = pydriller.GitRepository(git_repo_dir)

    dag = pp.DAG()
    for commit in tqdm(list(git_repo.get_list_commits()), desc='Creating DAG'):
        for modification in commit.modifications:

            if (modification.new_path not in dag.nodes) and \
               (modification.old_path == modification.new_path) and \
               (modification.change_type == pydriller.domain.commit.ModificationType.ADD):
                if modification.new_path not in dag.nodes:
                    dag.add_node(modification.new_path)
            elif modification.old_path != modification.new_path:
                if pd.isnull(modification.old_path):
                    if modification.new_path not in dag.nodes:
                        dag.add_node(modification.new_path)
                elif pd.isnull(modification.new_path):
                    pass
                else:
                    dag.add_edge(modification.new_path, modification.old_path)

    dag.make_acyclic()
    nodes = [k for k, v in dag.nodes.items() if v['indegree'] == 0 and not v['outdegree'] == 0]
    aliases = {z: y[-1] for x in nodes for y in dag.routes_from_node(x) for z in y[:-1]}

    return dag, aliases