Python tqdm.tqdm() Examples
The following are 30 code examples of tqdm.tqdm().
Example #1
Source File: From comet-commonsense with Apache License 2.0 | 8 votes |
def encode(self, texts, verbose=True):
    """Convert each text into a flat list of BPE vocabulary ids.

    Every text is cleaned with ``ftfy.fix_text`` and ``text_standardize``,
    passed through ``self.nlp``, and each resulting token is lower-cased and
    split into BPE pieces by ``self.bpe``. Pieces that are missing from
    ``self.encoder`` are mapped to id 0.

    :param texts: iterable of raw strings.
    :param verbose: when true, show a tqdm progress bar while encoding.
    :return: list holding one list of ids per input text.
    """
    # The progress bar is the only difference between the verbose and the
    # quiet path, so choose the iterable once instead of duplicating the loop.
    iterable = tqdm(texts, ncols=80, leave=False) if verbose else texts
    texts_tokens = []
    for text in iterable:
        doc = self.nlp(text_standardize(ftfy.fix_text(text)))
        text_tokens = []
        for token in doc:
            pieces = self.bpe(token.text.lower()).split(' ')
            text_tokens.extend(self.encoder.get(piece, 0) for piece in pieces)
        texts_tokens.append(text_tokens)
    return texts_tokens
Example #2
Source File: From pruning_yolov3 with GNU General Public License v3.0 | 8 votes |
def convert_images2bmp():
    """Convert the COCO 2014 JPEG images to BMP and rewrite the image lists.

    For each image folder a sibling ``<folder>bmp`` directory is recreated
    from scratch and every ``*.jpg`` is re-saved there as ``*.bmp``. The two
    image-list text files are then copied to ``*5k_bmp*`` variants that point
    at the converted images.
    """
    # cv2.imread() jpg at 230 img/s, *.bmp at 400 img/s
    for path in ['../coco/images/val2014/', '../coco/images/train2014/']:
        folder = os.sep + Path(path).name
        output = path.replace(folder, folder + 'bmp')
        if os.path.exists(output):
            shutil.rmtree(output)  # delete output folder
        os.makedirs(output)  # make new output folder

        for f in tqdm(glob.glob('%s*.jpg' % path)):
            save_name = f.replace('.jpg', '.bmp').replace(folder, folder + 'bmp')
            cv2.imwrite(save_name, cv2.imread(f))

    for label_path in ['../coco/trainvalno5k.txt', '../coco/5k.txt']:
        # Read the whole list first; the rewritten copy goes to a new file.
        with open(label_path, 'r') as file:
            lines = file.read()
        lines = lines.replace('2014/', '2014bmp/').replace('.jpg', '.bmp').replace(
            '/Users/glennjocher/PycharmProjects/', '../')
        with open(label_path.replace('5k', '5k_bmp'), 'w') as file:
            file.write(lines)
Example #3
Source File: From post--memorization-in-rnns with MIT License | 7 votes |
def save_tfrecord(filename, dataset, verbose=False):
    """Serialize ``dataset`` with a worker pool and write a ZLIB TFRecord file.

    :param filename: path handed to ``tf.python_io.TFRecordWriter``.
    :param dataset: mapping with parallel ``'length'``, ``'source'`` and
        ``'target'`` sequences; each zipped triple is passed to
        ``tfrecord_serializer``.
    :param verbose: show tqdm progress bars when true.
    """
    observations = len(dataset['length'])

    # Serialize every observation on a pool of 4 worker processes.
    with Pool(processes=4) as pool:
        rows = zip(dataset['length'], dataset['source'], dataset['target'])
        serialized = list(tqdm(
            pool.imap(tfrecord_serializer, rows, chunksize=10),
            total=observations, disable=not verbose
        ))

    # Save serialized dataset
    writer = tf.python_io.TFRecordWriter(
        filename,
        options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB
        )
    )
    try:
        for serialized_string in tqdm(serialized, disable=not verbose):
            writer.write(serialized_string)
    finally:
        # Close the file even when a write fails part-way through.
        writer.close()
Example #4
Source File: From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def load_embedding(self, f, reset=()):
    """Load pretrained word vectors from ``f`` into ``self.embed``.

    Each line is split on single spaces: the first field is the word, the
    remaining fields are parsed as floats. Words are lower-cased when
    ``self.lower`` is set and added to the vocabulary when
    ``self.include_unseen`` is set. Only words present in ``self.tok2idx``
    keep their vector.

    :param f: open text file object with one embedding per line.
    :param reset: tokens whose embedding row is zeroed even when a
        pretrained vector exists.
    :raises ValueError: if no line of ``f`` matches a vocabulary token, so
        the embedding dimension cannot be determined.
    """
    vectors = {}
    for line in tqdm(f.readlines(), desc='Loading embeddings'):
        tokens = line.rstrip('\n').split(' ')
        word = tokens[0].lower() if self.lower else tokens[0]
        if self.include_unseen:
            self.add(word)
        if word in self.tok2idx:
            vectors[word] = [float(x) for x in tokens[1:]]

    if not vectors:
        raise ValueError('no embedding in the file matches the vocabulary')

    # dict views are not subscriptable on Python 3, so take the first value
    # through an iterator instead of ``values()[0]``.
    dim = len(next(iter(vectors.values())))

    # Tokens without a pretrained vector, and tokens listed in ``reset``,
    # get an all-zero row.
    self.embed = mx.nd.array([
        vectors[tok] if tok in vectors and tok not in reset else [0.0] * dim
        for tok in self.idx2tok
    ])
Example #5
Source File: From Pytorch-Project-Template with MIT License | 6 votes |
def train(self):
    """
    Run the episode loop from ``self.current_episode`` up to
    ``self.config.num_episodes``, then render and close the environment.
    :return:
    """
    episodes = range(self.current_episode, self.config.num_episodes)
    for episode_idx in tqdm(episodes):
        self.current_episode = episode_idx
        # start every episode from a fresh environment state
        self.env.reset()
        self.train_one_epoch()

        # The target network stays frozen except on every
        # ``target_update``-th episode, when it copies the policy weights.
        due_for_sync = self.current_episode % self.config.target_update == 0
        if not due_for_sync:
            continue
        self.target_model.load_state_dict(self.policy_model.state_dict())

    self.env.render()
    self.env.close()
Example #6
Source File: From pytorch_NER_BiLSTM_CNN_CRF with Apache License 2.0 | 6 votes |
def _read_file(path):
    """
    Parse a space-separated embedding file into a dictionary.
    :param path: embed file path
    :return: dict mapping the first field of each kept line to the list of
        its remaining fields (still strings)
    """
    embed_dict = {}
    with open(path, encoding='utf-8') as handle:
        progress = tqdm.tqdm(handle.readlines())
        for raw in progress:
            fields = raw.strip().split(' ')
            # Lines with only one to three fields are skipped.
            if 1 <= len(fields) <= 3:
                continue
            embed_dict[fields[0]] = fields[1:]
    return embed_dict
Example #7
Source File: From tpu_pretrain with Apache License 2.0 | 6 votes |
def input_file_to_training_data(args, input_file, epoch, tokenizer, num_files):
    """Load blank-line separated documents and create the training files.

    Every non-blank line of ``input_file`` is tokenized and appended to the
    current document; a blank line closes the current document. The run is
    aborted via ``exit`` when the database ends up with at most one document.
    Otherwise ``create_training_file`` is called once per epoch to generate.
    """
    print(input_file)
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with open(input_file) as handle:
            current_doc = []
            for raw_line in tqdm(handle, desc="Loading Dataset", unit=" lines"):
                stripped = raw_line.strip()
                if stripped != "":
                    current_doc.append(tokenizer.tokenize(stripped))
                    continue
                # A blank line marks the end of the current document.
                docs.add_document(current_doc)
                current_doc = []
            # The file may end without a trailing blank line; keep that
            # last document as well.
            if current_doc:
                docs.add_document(current_doc)

        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        for offset in range(args.epochs_to_generate):
            create_training_file(docs, tokenizer, args, epoch + offset * num_files)
Example #8
Source File: From pruning_yolov3 with GNU General Public License v3.0 | 6 votes |
def coco_single_class_labels(path='../coco/labels/train2014/', label_class=43):
    """Build a single-class dataset under ``new/`` from label files.

    Usage: ``from utils.utils import *; coco_single_class_labels()``

    Every label file under ``path`` is parsed as whitespace-separated float
    rows whose first column is the class. For files containing
    ``label_class``, the matching rows are written to ``new/labels/`` with
    the class reset to 0, the image path is appended to ``new/images.txt``
    and the image is copied to ``new/images/``.

    :param path: directory holding the label files.
    :param label_class: class id to keep.
    """
    if os.path.exists('new/'):
        shutil.rmtree('new/')  # delete output folder
    os.makedirs('new/')  # make new output folder
    os.makedirs('new/labels/')
    os.makedirs('new/images/')
    for file in tqdm(sorted(glob.glob('%s/*.*' % path))):
        with open(file, 'r') as f:
            labels = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
        i = labels[:, 0] == label_class  # mask of rows with the wanted class
        if any(i):
            img_file = file.replace('labels', 'images').replace('txt', 'jpg')
            labels[:, 0] = 0  # reset class to 0
            with open('new/images.txt', 'a') as f:  # add image to dataset list
                f.write(img_file + '\n')
            with open('new/labels/' + Path(file).name, 'a') as f:  # write label
                for l in labels[i]:
                    f.write('%g %.6f %.6f %.6f %.6f\n' % tuple(l))
            shutil.copyfile(src=img_file, dst='new/images/' + Path(file).name.replace('txt', 'jpg'))  # copy images
Example #9
Source File: From Image-Caption-Generator with MIT License | 6 votes |
def extract_features(path, model_type):
    """Encode every image in ``path`` with the selected CNN.

    :param path: directory containing the images.
    :param model_type: ``'inceptionv3'`` or ``'vgg16'``; selects the
        preprocessing function and the input size.
    :return: dict mapping the image id (file name up to the first ``.``) to
        the value returned by ``model.predict`` for that image.
    :raises ValueError: if ``model_type`` is not one of the supported names.
    """
    if model_type == 'inceptionv3':
        from keras.applications.inception_v3 import preprocess_input
        target_size = (299, 299)
    elif model_type == 'vgg16':
        from keras.applications.vgg16 import preprocess_input
        target_size = (224, 224)
    else:
        # Without this branch the names above stay unbound and the loop
        # below fails with an unrelated NameError.
        raise ValueError('unsupported model_type: {!r}'.format(model_type))

    # Get CNN Model
    model = CNNModel(model_type)
    features = dict()
    # Extract features from each photo
    for name in tqdm(os.listdir(path)):
        # Loading and resizing image; os.path.join also works when ``path``
        # has no trailing separator.
        filename = os.path.join(path, name)
        image = load_img(filename, target_size=target_size)
        # Convert the image pixels to a numpy array
        image = img_to_array(image)
        # Add a leading batch axis of size 1 for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # Prepare the image for the CNN Model
        image = preprocess_input(image)
        # Pass image into model to get encoded features
        feature = model.predict(image, verbose=0)
        # Store encoded features for the image
        image_id = name.split('.')[0]
        features[image_id] = feature
    return features
Example #10
Source File: From pruning_yolov3 with GNU General Public License v3.0 | 6 votes |
def crop_images_random(path='../images/', scale=0.50):
    """Crop every readable image under ``path`` to a random square window.

    Usage: ``from utils.utils import *; crop_images_random()``

    The window side is drawn between 30 px and ``scale`` times the image
    height, and the window is clipped to the image bounds.
    WARNING: the cropped image overwrites the original file!
    """
    min_side = 30  # minimum size (pixels)
    for img_path in tqdm(sorted(glob.glob('%s/*.*' % path))):
        img = cv2.imread(img_path)  # BGR
        if img is None:
            # skip files for which imread returned nothing
            continue
        height, width = img.shape[:2]

        # side length of the square window
        side = random.randint(min_side, int(max(min_side, height * scale)))
        half = side // 2

        # window corners, clipped to the image
        left = max(0, random.randint(0, width) - half)
        top = max(0, random.randint(0, height) - half)
        right = min(width, left + side)
        bottom = min(height, top + side)

        # write the crop back over the source file
        cv2.imwrite(img_path, img[top:bottom, left:right])
Example #11
Source File: From Deep_VoiceChanger with MIT License | 6 votes |
def auto_inverse(self, whole_spectrum):
    """Invert a whole spectrum to a waveform by calling ``self.inverse`` on row batches.

    The input is copied, cast to complex and clamped so that entries below 1
    become 1. Batches of ``self.parallel`` rows spaced ``parallel_dif`` apart
    are passed to ``self.inverse``; the first ``overwrap`` results are
    discarded before the inverse FFT and the overlap-add into the output.

    :raises Exception: if the spectrum has fewer rows than
        ``self.parallel * overwrap``.
    """
    spectrum = np.copy(whole_spectrum).astype(complex)
    spectrum[spectrum < 1] = 1

    overwrap = self.buffer_size * 2
    height = spectrum.shape[0]
    parallel_dif = (height - overwrap) // self.parallel
    if height < self.parallel * overwrap:
        raise Exception('voice length is too small to use gpu, or parallel number is too big')

    inverted = []
    for start in tqdm.tqdm(range(parallel_dif + overwrap)):
        rows = range(start, start + parallel_dif * self.parallel, parallel_dif)
        inverted.append(self.inverse(spectrum[rows, :]))

    spec = np.concatenate(inverted[overwrap:], axis=1).reshape(-1, self.wave_len)

    # NOTE (from the original author): the code below does not take
    # wave_len and wave_dif into account yet.
    wave = np.fft.ifft(spec, axis=1).real
    wave = np.concatenate([wave, np.zeros((wave.shape[0], 2), dtype=float)], axis=1)

    dst = np.zeros((wave.shape[0] + 3) * self.wave_dif, dtype=float)
    for phase in range(4):
        # every 4th frame starting at ``phase``, flattened, is added at an
        # offset of ``phase * wave_dif`` samples
        chunk = wave[range(phase, wave.shape[0], 4), :].reshape(-1)
        offset = phase * self.wave_dif
        dst[offset:offset + len(chunk)] += chunk
    return dst * 0.5
Example #12
Source File: From Kaggler with MIT License | 6 votes |
def test():
    """Train a ``ClassificationTree`` on random data and print its hit count.

    Random integer features and binary labels are generated, the first half
    is fed to ``learner.update`` one row at a time, and the number of correct
    predictions on the second half is printed.
    """
    data = np.random.randint(0, 1000, size=(N_OBS, N_FEATURE))
    y = np.random.randint(2, size=N_OBS)

    half = N_OBS // 2
    x_train, y_train = data[0:half], y[0:half]
    x_test, y_test = data[half:], y[half:]

    learner = ClassificationTree(number_of_features=N_FEATURE)

    for row, target in zip(tqdm(x_train), y_train):
        learner.update(row, target)

    correct_num = 0
    for row, target in zip(tqdm(x_test), y_test):
        if learner.predict(row) == target:
            correct_num += 1
    print(correct_num)
Example #13
Source File: From treelstm.pytorch with MIT License | 6 votes |
def test(self, dataset):
    """Evaluate the model on ``dataset`` without tracking gradients.

    :param dataset: indexable dataset yielding
        ``(ltree, linput, rtree, rinput, label)`` and exposing
        ``num_classes``.
    :return: tuple ``(mean loss per sample, predictions)`` where
        ``predictions`` is a CPU float tensor with one score per sample.
    """
    self.model.eval()
    with torch.no_grad():
        total_loss = 0.0
        predictions = torch.zeros(len(dataset), dtype=torch.float, device='cpu')
        # class values 1..num_classes, used to weight exp(output)
        indices = torch.arange(1, dataset.num_classes + 1, dtype=torch.float, device='cpu')
        for idx in tqdm(range(len(dataset)), desc='Testing epoch  ' + str(self.epoch) + ''):
            ltree, linput, rtree, rinput, label = dataset[idx]
            target = utils.map_label_to_target(label, dataset.num_classes)
            # NOTE(review): the device transfers below were reconstructed
            # from a garbled source line - confirm that the trainer stores
            # its device as ``self.device``.
            linput, rinput = linput.to(self.device), rinput.to(self.device)
            target = target.to(self.device)
            output = self.model(ltree, linput, rtree, rinput)
            loss = self.criterion(output, target)
            total_loss += loss.item()
            output = output.squeeze().to('cpu')
            # presumably ``output`` holds log-probabilities, making this the
            # expected class value - verify against the model
            predictions[idx] = torch.dot(indices, torch.exp(output))
    return total_loss / len(dataset), predictions
Example #14
Source File: From post--memorization-in-rnns with MIT License | 6 votes |
def save_tfrecord(filename, dataset, verbose=False):
    """Serialize ``dataset`` with a worker pool and write a ZLIB TFRecord file.

    :param filename: path handed to ``tf.python_io.TFRecordWriter``.
    :param dataset: mapping with parallel ``'length'``, ``'source'`` and
        ``'target'`` sequences; each zipped triple is passed to
        ``tfrecord_serializer``.
    :param verbose: show tqdm progress bars when true.
    """
    observations = len(dataset['length'])

    # Serialize every observation on a pool of 4 worker processes.
    with Pool(processes=4) as pool:
        rows = zip(dataset['length'], dataset['source'], dataset['target'])
        serialized = list(tqdm(
            pool.imap(tfrecord_serializer, rows, chunksize=10),
            total=observations, disable=not verbose
        ))

    # Save serialized dataset
    writer = tf.python_io.TFRecordWriter(
        filename,
        options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB
        )
    )
    try:
        for serialized_string in tqdm(serialized, disable=not verbose):
            writer.write(serialized_string)
    finally:
        # Close the file even when a write fails part-way through.
        writer.close()
Example #15
Source File: From git2net with GNU Affero General Public License v3.0 | 6 votes |
def _process_repo_serial(git_repo_dir, sqlite_db_file, commits, extraction_settings):
    """ Processes all commits in a given git repository in a serial manner.

    Args:
        git_repo_dir: path to the git repository that is mined
        sqlite_db_file: path (including database name) where the sqlite database will be created
        commits: list of commits that have to be processed
        extraction_settings: settings for the extraction

    Returns:
        sqlite database will be written at specified location
    """
    # Kept from the original even though the object is not used below:
    # constructing it may validate the repository path.
    git_repo = pydriller.GitRepository(git_repo_dir)

    con = sqlite3.connect(sqlite_db_file)
    try:
        for commit in tqdm(commits, desc='Serial'):
            args = {'git_repo_dir': git_repo_dir, 'commit_hash': commit.hash,
                    'extraction_settings': extraction_settings}
            result = _process_commit(args)

            # Only non-empty frames are appended to their tables.
            if not result['edits'].empty:
                result['edits'].to_sql('edits', con, if_exists='append', index=False)
            if not result['commit'].empty:
                result['commit'].to_sql('commits', con, if_exists='append', index=False)
    finally:
        # Release the database handle even when a commit fails to process.
        con.close()
Example #16
Source File: From python-pool-performance with MIT License | 6 votes |
def run_test(work_type: str, job_sets: Sequence, trials: int,
             pool_class: type, worker_count: int) -> list:
    """Run one benchmark per job set on a fresh pool and summarize each result.

    :param work_type: ``'compute'`` or ``'network'``; selects the pool's
        test method.
    :param job_sets: sequence of job sets, each handed to the test method.
    :param trials: forwarded unchanged to the test method.
    :param pool_class: class instantiated as ``pool_class(worker_count)``.
    :param worker_count: number of workers for the pool.
    :return: list with ``summarize_test(result)`` for each job set.
    :raises Exception: if ``work_type`` is not a known work type.
    """
    pool = pool_class(worker_count)
    try:
        if work_type == 'compute':
            test_func = pool.run_compute_test
        elif work_type == 'network':
            test_func = pool.run_network_test
        else:
            raise Exception("Invalid work type: {}".format(work_type))

        # Each job set is run and summarized before the next one starts.
        summarized_results = [
            summarize_test(test_func(jobs, trials, show_progress=True))
            for jobs in tqdm(job_sets, desc=pool_class.__name__)
        ]
    finally:
        # Tear the pool down even when a test or the validation fails.
        pool.destroy_pool()
    return summarized_results
Example #17
Source File: From treelstm.pytorch with MIT License | 6 votes |
def train(self, dataset):
    """Train the model for one epoch over ``dataset`` in random order.

    Gradients are accumulated sample by sample; the optimizer steps whenever
    the positive sample index is a multiple of ``self.args.batchsize``.

    :param dataset: indexable dataset yielding
        ``(ltree, linput, rtree, rinput, label)`` and exposing
        ``num_classes``.
    :return: mean loss per sample.
    """
    self.model.train()
    self.optimizer.zero_grad()
    total_loss = 0.0
    indices = torch.randperm(len(dataset), dtype=torch.long, device='cpu')
    for idx in tqdm(range(len(dataset)), desc='Training epoch ' + str(self.epoch + 1) + ''):
        ltree, linput, rtree, rinput, label = dataset[indices[idx]]
        target = utils.map_label_to_target(label, dataset.num_classes)
        # NOTE(review): the device transfers below were reconstructed from a
        # garbled source line - confirm that the trainer stores its device
        # as ``self.device``.
        linput, rinput = linput.to(self.device), rinput.to(self.device)
        target = target.to(self.device)
        output = self.model(ltree, linput, rtree, rinput)
        loss = self.criterion(output, target)
        total_loss += loss.item()
        loss.backward()
        if idx % self.args.batchsize == 0 and idx > 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
    self.epoch += 1
    return total_loss / len(dataset)
Example #18
Source File: From cs294-112_hws with MIT License | 5 votes |
def create_visualization(self, dirname, density=False):
    """Render 100 frames and a GIF for every entry of ``dirname``.

    ``density`` is accepted but not used by this method.
    """
    for entry in os.listdir(dirname):
        for frame_idx in tqdm(range(100)):
            frame_target = os.path.join(dirname, entry)
            self.visualize(None, frame_idx, frame_target)
        gif_target = os.path.join(dirname, str(entry))
        self.create_gif(gif_target)
Example #19
Source File: From easy-faster-rcnn.pytorch with MIT License | 5 votes |
def evaluate(self, model: Model) -> Tuple[float, str]:
    """Run ``model`` over ``self._dataloader`` and score the detections.

    Detections are rescaled by the per-image scale, those with probability
    of at most 0.05 are dropped, and the rest are handed to
    ``self._dataset.evaluate``.

    :return: the ``(mean_ap, detail)`` pair returned by the dataset's
        ``evaluate``.
    """
    image_ids, bboxes, classes, probs = [], [], [], []

    with torch.no_grad():
        for image_id_batch, image_batch, scale_batch, _, _ in tqdm(self._dataloader):
            image_batch = image_batch.cuda()
            assert image_batch.shape[0] == 1, 'do not use batch size more than 1 on evaluation'

            det_bboxes, det_classes, det_probs, det_batch_indices = \
                model.eval().forward(image_batch)

            # undo the input scaling so the boxes match the original image
            scale = scale_batch[det_batch_indices].unsqueeze(dim=-1)
            scale = scale.expand_as(det_bboxes).to(device=det_bboxes.device)
            det_bboxes = det_bboxes / scale

            # keep only detections above the probability threshold
            keep = (det_probs > 0.05).nonzero().view(-1)
            det_bboxes = det_bboxes[keep]
            det_classes = det_classes[keep]
            det_probs = det_probs[keep]
            det_batch_indices = det_batch_indices[keep]

            bboxes.extend(det_bboxes.tolist())
            classes.extend(det_classes.tolist())
            probs.extend(det_probs.tolist())
            image_ids.extend([image_id_batch[i] for i in det_batch_indices])

    mean_ap, detail = self._dataset.evaluate(self._path_to_results_dir, image_ids,
                                             bboxes, classes, probs)
    return mean_ap, detail
Example #20
Source File: From dockerfiles with Apache License 2.0 | 5 votes |
def train(epochs):
    """Run ``train_step`` on every batch of ``dataset`` for ``epochs`` epochs.

    :param epochs: number of passes over ``dataset``.
    """
    for epoch in range(epochs):
        # Wrap the dataset itself rather than ``enumerate(dataset)``: tqdm
        # can then read the length of the iterable when it has one, and the
        # batch index was never used.
        for images, labels in tqdm(dataset):
            train_step(images, labels)
        print ('Epoch {} finished'.format(epoch))
Example #21
Source File: From lirpg with MIT License | 5 votes |
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs,
           stochastic_policy, save=False, reuse=False):
    """Roll out a loaded policy and report the average length and return.

    :param env: environment exposing ``observation_space`` and
        ``action_space``.
    :param policy_func: callable building the policy as
        ``policy_func("pi", ob_space, ac_space, reuse=reuse)``.
    :param load_model_path: checkpoint passed to ``U.load_state``.
    :param timesteps_per_batch: forwarded to ``traj_1_generator``.
    :param number_trajs: number of trajectories to sample.
    :param stochastic_policy: forwarded as ``stochastic`` to the generator.
    :param save: when true, store the trajectories with ``np.savez``.
    :param reuse: forwarded to ``policy_func``.
    :return: tuple ``(average episode length, average episode return)``.
    """
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        # NOTE(review): the suffix after '.' was lost in the garbled source;
        # ``env.spec.id`` is a reconstruction - confirm against upstream.
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list),
                 lens=np.array(len_list), rets=np.array(ret_list))
    avg_len = sum(len_list)/len(len_list)
    avg_ret = sum(ret_list)/len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
Example #22
Source File: From post--memorization-in-rnns with MIT License | 5 votes |
def download(*args, desc=None, **kwargs):
    """Call ``urllib.request.urlretrieve`` with a tqdm progress bar.

    Positional and keyword arguments are forwarded to ``urlretrieve``.
    ``reporthook`` and ``data`` are set by this function, so passing either
    of them as a keyword raises ``TypeError``.

    :param desc: label shown next to the progress bar.
    """
    # Block count seen by the previous hook call; a one-element list so the
    # nested hook can update it.
    last_b = [0]

    def _download_hook(b=1, bsize=1, tsize=None):
        # b: blocks transferred so far, bsize: block size, tsize: total size
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    with tqdm(unit='B', unit_scale=True, miniters=1, desc=desc) as t:
        urllib.request.urlretrieve(*args, reporthook=_download_hook,
                                   data=None, **kwargs)
Example #23
Source File: From post--memorization-in-rnns with MIT License | 5 votes |
def make_source_target_alignment(words, char_map, max_length, verbose=False):
    """Pack words into aligned source/target character-code sequences.

    For every word the source gets ``[space] + chars`` and the target gets
    ``chars + [space]``, so the target is the source shifted by one
    character. Words are packed greedily until adding the next one would
    exceed ``max_length`` characters. A single word longer than
    ``max_length`` is kept as its own over-long sequence.

    :param words: iterable of strings.
    :param char_map: mapping from character to integer code; must contain
        ``' '``.
    :param max_length: maximum number of characters per packed sequence.
    :param verbose: show a tqdm progress bar when true.
    :return: tuple ``(length, source, target)`` of parallel lists; ``source``
        and ``target`` hold int32 arrays.
    """
    space_char_code = char_map[' ']

    source, target, length = [], [], []
    source_current, target_current = [], []
    length_current = 0

    for word in tqdm(words, disable=not verbose):
        # Flush the buffer when the word does not fit. The emptiness guard
        # avoids ``np.concatenate([])``, which raises ValueError when an
        # over-long word arrives while the buffer is still empty.
        if source_current and length_current + len(word) + 1 > max_length:
            source.append(np.concatenate(source_current))
            target.append(np.concatenate(target_current))
            length.append(length_current)

            # prepare for new source and target
            source_current = []
            target_current = []
            length_current = 0

        # add source and target, while maintaining the current total length
        codes = [char_map[char] for char in word]
        source_current.append(np.array([space_char_code] + codes, dtype='int32'))
        target_current.append(np.array(codes + [space_char_code], dtype='int32'))
        length_current += 1 + len(word)

    # concatenate remaining data and move it to storage
    if length_current > 0:
        source.append(np.concatenate(source_current))
        target.append(np.concatenate(target_current))
        length.append(length_current)

    return (length, source, target)
Example #24
Source File: From lirpg with MIT License | 5 votes |
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None,
          verbose=False):
    """Pretrain a policy by behavior cloning and save its state.

    :param env: environment exposing ``observation_space`` and
        ``action_space``.
    :param policy_func: callable building the policy as
        ``policy_func("pi", ob_space, ac_space)``.
    :param dataset: object providing ``get_next_batch(size, split)``.
    :param optim_batch_size: batch size for each training step.
    :param max_iters: number of training iterations.
    :param adam_epsilon: epsilon of the Adam optimizer.
    :param optim_stepsize: step size passed to ``adam.update``.
    :param ckpt_dir: directory for the saved state; a temporary directory
        name is used when ``None``.
    :param log_dir: accepted but not used by this function.
    :param task_name: file name of the saved state inside ``ckpt_dir``.
    :param verbose: log training and validation loss about ten times over
        the run.
    :return: path the state was saved to.
    """
    # Validate roughly ten times per run; never 0, which would make the
    # modulo below raise ZeroDivisionError for max_iters < 10.
    val_per_iter = max(1, int(max_iters/10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # NOTE(review): the loss expression was reconstructed from a garbled
    # source line (mean squared error between expert and policy action) -
    # confirm against upstream.
    loss = tf.reduce_mean(tf.square(ac-pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
Example #25
Source File: From torch-toolbox with BSD 3-Clause "New" or "Revised" License | 5 votes |
def generate_lmdb_dataset(
        data_set: Dataset,
        save_dir: str,
        name: str,
        num_workers=0,
        max_size_rate=1.0,
        write_frequency=5000):
    """Dump ``data_set`` into an LMDB file at ``save_dir/<name>.lmdb``.

    Samples are stored under ``get_key(idx)``. The sample count is stored
    under ``b'__len__'`` and, when the dataset has them, ``classes`` and
    ``class_to_idx`` under keys of the same name.

    :param data_set: dataset to export.
    :param save_dir: output directory, passed through ``check_dir`` first.
    :param name: base name of the ``.lmdb`` file.
    :param num_workers: worker count of the internal ``DataLoader``.
    :param max_size_rate: scale applied to the 1 TiB base ``map_size``.
    :param write_frequency: commit the write transaction every this many
        samples.
    """
    # The identity collate function keeps each batch as a plain list, so
    # ``data[0]`` below is the first raw sample of the batch.
    data_loader = DataLoader(
        data_set,
        num_workers=num_workers,
        collate_fn=lambda x: x)
    num_samples = len(data_set)
    check_dir(save_dir)
    lmdb_path = os.path.join(save_dir, '{}.lmdb'.format(name))
    # NOTE(review): the callee was lost in the garbled source;
    # ``lmdb.open(lmdb_path, ...)`` is a reconstruction - confirm.
    db = lmdb.open(lmdb_path, subdir=False,
                   map_size=int(1099511627776 * max_size_rate),
                   readonly=False, meminit=True, map_async=True)
    txn = db.begin(write=True)
    for idx, data in enumerate(tqdm(data_loader)):
        txn.put(get_key(idx), dumps_pyarrow(data[0]))
        # Commit periodically and continue in a fresh write transaction.
        if idx % write_frequency == 0 and idx > 0:
            txn.commit()
            txn = db.begin(write=True)
    txn.put(b'__len__', dumps_pyarrow(num_samples))
    try:
        classes = data_set.classes
        class_to_idx = data_set.class_to_idx
        txn.put(b'classes', dumps_pyarrow(classes))
        txn.put(b'class_to_idx', dumps_pyarrow(class_to_idx))
    except AttributeError:
        # datasets without class metadata simply skip these keys
        pass

    txn.commit()
    db.sync()
    db.close()
Example #26
Source File: From tpu_pretrain with Apache License 2.0 | 5 votes |
def sync(local_dir, keypath, bucket):
    """Upload every file found under ``local_dir`` to the S3 ``bucket``.

    Each file is stored as ``<keypath>/<file name>``, so files from
    sub-directories all end up directly under ``keypath``. Any exception is
    printed rather than raised.
    """
    try:
        target_bucket = s3_resource.Bucket(bucket)
        for dir_path, _subdirs, filenames in os.walk(local_dir):
            # use forward slashes in the local path
            dir_path = dir_path.replace("\\", "/")
            for filename in tqdm(filenames):
                local_file = os.path.join(dir_path, filename)
                target_bucket.upload_file(local_file, f"{keypath}/{filename}")
    except Exception as err:
        print(err)
Example #27
Source File: From OpenNRE with MIT License | 5 votes |
def eval_model(self, eval_loader):
    """Score every bag in ``eval_loader`` and evaluate the predictions.

    For each bag and each relation other than ``'NA'`` a prediction record
    with the entity pair, relation name and score is collected. The records
    are passed to ``eval_loader.dataset.eval``.

    :param eval_loader: loader yielding batches laid out as
        ``[label, bag_name, scope, *model_args]``.
    :return: whatever ``eval_loader.dataset.eval`` returns.
    """
    self.model.eval()
    with torch.no_grad():
        t = tqdm(eval_loader)
        pred_result = []
        for data in t:
            if torch.cuda.is_available():
                for i in range(len(data)):
                    # Best-effort move to the GPU: batch entries that cannot
                    # be moved stay where they are. Exception (not a bare
                    # except) keeps KeyboardInterrupt/SystemExit working.
                    try:
                        data[i] = data[i].cuda()
                    except Exception:
                        pass
            bag_name = data[1]
            scope = data[2]
            args = data[3:]
            logits = self.model(None, scope, *args, train=False, bag_size=self.bag_size)  # results after softmax
            logits = logits.cpu().numpy()
            id2rel = self.model.module.id2rel
            for i in range(len(logits)):
                for relid in range(self.model.module.num_class):
                    if id2rel[relid] != 'NA':
                        pred_result.append({
                            'entpair': bag_name[i][:2],
                            'relation': id2rel[relid],
                            'score': logits[i][relid]
                        })
        result = eval_loader.dataset.eval(pred_result)
    return result
Example #28
Source File: From OpenNRE with MIT License | 5 votes |
def eval_model(self, eval_loader):
    """Predict a label for every sample in ``eval_loader`` and evaluate.

    The running accuracy is shown on the progress bar while the predicted
    class ids are collected and passed to ``eval_loader.dataset.eval``.

    :param eval_loader: loader yielding batches laid out as
        ``[label, *model_args]``.
    :return: whatever ``eval_loader.dataset.eval`` returns.
    """
    self.eval()
    avg_acc = AverageMeter()
    pred_result = []
    with torch.no_grad():
        t = tqdm(eval_loader)
        for data in t:
            if torch.cuda.is_available():
                for i in range(len(data)):
                    # Best-effort move to the GPU: batch entries that cannot
                    # be moved stay where they are. Exception (not a bare
                    # except) keeps KeyboardInterrupt/SystemExit working.
                    try:
                        data[i] = data[i].cuda()
                    except Exception:
                        pass
            label = data[0]
            args = data[1:]
            logits = self.parallel_model(*args)
            score, pred = logits.max(-1)  # (B)
            # Save result
            for i in range(pred.size(0)):
                pred_result.append(pred[i].item())
            # Log
            acc = float((pred == label).long().sum()) / label.size(0)
            avg_acc.update(acc, pred.size(0))
            t.set_postfix(acc=avg_acc.avg)
    result = eval_loader.dataset.eval(pred_result)
    return result
Example #29
Source File: From git2net with GNU Affero General Public License v3.0 | 5 votes |
def _process_repo_parallel(git_repo_dir, sqlite_db_file, commits, extraction_settings):
    """ Processes all commits in a given git repository in a parallel manner.

    Args:
        git_repo_dir: path to the git repository that is mined
        sqlite_db_file: path (including database name) where the sqlite database will be created
        commits: list of commits that are already in the database
        extraction_settings: settings for the extraction

    Returns:
        sqlite database will be written at specified location
    """
    args = [{'git_repo_dir': git_repo_dir, 'commit_hash': commit.hash,
             'extraction_settings': extraction_settings}
            for commit in commits]

    # Pool initializer that stores the shared lock in each worker's module
    # globals (suggestion by marco-c).
    def _init(git_repo_dir, git_init_lock_):
        global git_init_lock
        git_init_lock = git_init_lock_

    no_of_processes = extraction_settings['no_of_processes']

    con = sqlite3.connect(sqlite_db_file)
    try:
        with multiprocessing.Pool(no_of_processes,
                                  initializer=_init,
                                  initargs=(git_repo_dir, git_init_lock)) as p:
            with tqdm(total=len(args), desc='Parallel ({0} processes)'
                      .format(no_of_processes)) as pbar:
                # Results arrive in completion order, not commit order.
                for result in p.imap_unordered(_process_commit, args,
                                               chunksize=extraction_settings['chunksize']):
                    if not result['edits'].empty:
                        result['edits'].to_sql('edits', con, if_exists='append', index=False)
                    if not result['commit'].empty:
                        result['commit'].to_sql('commits', con, if_exists='append', index=False)
                    pbar.update(1)
    finally:
        # Release the database handle even when a worker raises.
        con.close()
Example #30
Source File: From git2net with GNU Affero General Public License v3.0 | 5 votes |
def identify_file_renaming(git_repo_dir):
    """ Identifies all names and locations different files in a repository have had.

    Args:
        git_repo_dir: path to the git repository that is mined

    Returns:
        dag: pathpy DAG object depicting the renaming process
        aliases: dictionary containing all aliases for all files
    """
    # TODO: Consider corner case where file is renamed and new file with old name is created.
    git_repo = pydriller.GitRepository(git_repo_dir)
    added = pydriller.domain.commit.ModificationType.ADD

    dag = pp.DAG()
    for commit in tqdm(list(git_repo.get_list_commits()), desc='Creating DAG'):
        for mod in commit.modifications:
            old_path, new_path = mod.old_path, mod.new_path

            if (new_path not in dag.nodes) and (old_path == new_path) and \
                    (mod.change_type == added):
                # newly added file that the DAG does not know yet
                dag.add_node(new_path)
                continue

            if old_path == new_path:
                continue

            if pd.isnull(old_path):
                # no previous path: register the node if it is unknown
                if new_path not in dag.nodes:
                    dag.add_node(new_path)
            elif not pd.isnull(new_path):
                # rename: edge from the new name to the old one
                dag.add_edge(new_path, old_path)

    dag.make_acyclic()

    # start nodes: nothing points at them, but they point at something
    nodes = [name for name, attrs in dag.nodes.items()
             if attrs['indegree'] == 0 and attrs['outdegree'] != 0]

    # every non-final node of a route is an alias of the route's last node
    aliases = {}
    for start in nodes:
        for route in dag.routes_from_node(start):
            for alias in route[:-1]:
                aliases[alias] = route[-1]

    return dag, aliases