Python msgpack.load() Examples

The following are 23 code examples of msgpack.load(), drawn from the open-source projects named above each example. You may also want to check out all available functions/classes of the module msgpack, or try the search function.
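Before the examples, here is a minimal round-trip sketch of the API they all build on: msgpack.dump() packs a Python object into a binary stream and msgpack.load() unpacks it back. The file name is illustrative.

import msgpack

data = {'name': 'example', 'values': [1, 2, 3]}

# Pack to a binary file (msgpack.dump is an alias of msgpack.pack).
with open('data.msgpack', 'wb') as f:
    msgpack.dump(data, f)

# Unpack it again. raw=False decodes msgpack strings to str; it is the
# default since msgpack 1.0, whereas 0.x releases returned bytes.
with open('data.msgpack', 'rb') as f:
    restored = msgpack.load(f, raw=False)

assert restored == data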
Example #1
Source File: interface.py    From simple-effective-text-matching with Apache License 2.0    6 votes
def __init__(self, args, log=None):
        self.args = args
        # build/load vocab and target map
        vocab_file = os.path.join(args.output_dir, 'vocab.txt')
        target_map_file = os.path.join(args.output_dir, 'target_map.txt')
        if not os.path.exists(vocab_file):
            data = load_data(self.args.data_dir)
            self.target_map = Indexer.build((sample['target'] for sample in data), log=log)
            self.target_map.save(target_map_file)
            self.vocab = Vocab.build((word for sample in data
                                      for text in (sample['text1'], sample['text2'])
                                      for word in text.split()[:self.args.max_len]),
                                     lower=args.lower_case, min_df=self.args.min_df, log=log,
                                     pretrained_embeddings=args.pretrained_embeddings,
                                     dump_filtered=os.path.join(args.output_dir, 'filtered_words.txt'))
            self.vocab.save(vocab_file)
        else:
            self.target_map = Indexer.load(target_map_file)
            self.vocab = Vocab.load(vocab_file)
        args.num_classes = len(self.target_map)
        args.num_vocab = len(self.vocab)
        args.padding = Vocab.pad() 
Example #2
Source File: coverage.py    From bncov with MIT License    6 votes
def save_to_file(self, filename):
        """Save only the bare minimum needed to reconstruct this CoverageDB.

        This serializes the data to a single file and can reduce the disk footprint of
        block coverage significantly (depending on overlap and number of files)."""
        if file_backing_disabled:
            raise Exception("[!] Can't save/load coverage db files without msgpack. Try `pip install msgpack`")
        save_dict = dict()
        save_dict["version"] = 1  # serialized covdb version
        save_dict["module_name"] = self.module_name
        save_dict["module_base"] = self.module_base
        save_dict["coverage_files"] = self.coverage_files
        # save a tighter version of the block dict: {int: [int]} instead of {int: [str]}
        block_dict_to_save = {}
        file_index_map = {filepath: self.coverage_files.index(filepath) for filepath in self.coverage_files}
        for block, trace_list in self.block_dict.items():
            trace_id_list = [file_index_map[name] for name in trace_list]
            block_dict_to_save[block] = trace_id_list
        save_dict["block_dict"] = block_dict_to_save
        # write packed version to file
        with open(filename, "wb") as f:
            msgpack.dump(save_dict, f)
            self.filename = filename 
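The disk saving in save_to_file() comes from replacing each repeated file-path string with its integer index into coverage_files before packing. A standalone sketch of that mapping with hypothetical data (enumerate() is the idiomatic way to build the index map that the snippet builds with list.index()):

# Hypothetical block dict: basic-block address -> list of trace file paths.
coverage_files = ['trace_a.cov', 'trace_b.cov']
block_dict = {0x1000: ['trace_a.cov', 'trace_b.cov'], 0x1040: ['trace_b.cov']}

file_index_map = {path: i for i, path in enumerate(coverage_files)}
packed_form = {block: [file_index_map[p] for p in paths]
               for block, paths in block_dict.items()}

assert packed_form == {0x1000: [0, 1], 0x1040: [1]}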
Example #3
Source File: interface.py    From simple-effective-text-matching-pytorch with Apache License 2.0    6 votes
def __init__(self, args, log=None):
        self.args = args
        # build/load vocab and target map
        vocab_file = os.path.join(args.output_dir, 'vocab.txt')
        target_map_file = os.path.join(args.output_dir, 'target_map.txt')
        if not os.path.exists(vocab_file):
            data = load_data(self.args.data_dir)
            self.target_map = Indexer.build((sample['target'] for sample in data), log=log)
            self.target_map.save(target_map_file)
            self.vocab = Vocab.build((word for sample in data
                                      for text in (sample['text1'], sample['text2'])
                                      for word in text.split()[:self.args.max_len]),
                                     lower=args.lower_case, min_df=self.args.min_df, log=log,
                                     pretrained_embeddings=args.pretrained_embeddings,
                                     dump_filtered=os.path.join(args.output_dir, 'filtered_words.txt'))
            self.vocab.save(vocab_file)

        else:
            self.target_map = Indexer.load(target_map_file)
            self.vocab = Vocab.load(vocab_file)
        args.num_classes = len(self.target_map)
        args.num_vocab = len(self.vocab)
        args.padding = Vocab.pad() 
Example #4
Source File: serialization.py    From dcase_util with MIT License    5 votes
def load_json(cls, filename):
        """Load JSON file

        Parameters
        ----------
        filename : str
            Filename path

        Returns
        -------
        data

        """

        cls.file_exists(filename=filename)

        try:
            import ujson as json

        except ImportError:
            try:
                import json

            except ImportError:
                message = '{name}: Unable to import json module.'.format(
                    name=cls.__name__
                )

                cls.logger().exception(message)
                raise ImportError(message)

        return json.load(open(filename, "r")) 
Example #5
Source File: train.py    From sru with MIT License    5 votes
def load_data(opt):
    with open('SQuAD/meta.msgpack', 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)
    if not opt['fix_embeddings']:
        embedding[1] = torch.normal(means=torch.zeros(opt['embedding_dim']), std=1.)
    with open(args.data_file, 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    train_orig = pd.read_csv('SQuAD/train.csv')
    dev_orig = pd.read_csv('SQuAD/dev.csv')
    train = list(zip(
        data['trn_context_ids'],
        data['trn_context_features'],
        data['trn_context_tags'],
        data['trn_context_ents'],
        data['trn_question_ids'],
        train_orig['answer_start_token'].tolist(),
        train_orig['answer_end_token'].tolist(),
        data['trn_context_text'],
        data['trn_context_spans']
    ))
    dev = list(zip(
        data['dev_context_ids'],
        data['dev_context_features'],
        data['dev_context_tags'],
        data['dev_context_ents'],
        data['dev_question_ids'],
        data['dev_context_text'],
        data['dev_context_spans']
    ))
    dev_y = dev_orig['answers'].tolist()[:len(dev)]
    dev_y = [eval(y) for y in dev_y]
    return train, dev, dev_y, embedding, opt 
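A caveat that applies to this and the later FlowDelta/SDNet-style snippets: the encoding argument to msgpack.load() was deprecated and then removed in msgpack 1.0. On current releases the equivalent decode uses raw=False, as in this sketch:

import msgpack

with open('SQuAD/meta.msgpack', 'rb') as f:
    # msgpack >= 1.0: raw=False replaces the removed encoding='utf8' argument
    meta = msgpack.load(f, raw=False)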
Example #6
Source File: interface.py    From simple-effective-text-matching with Apache License 2.0    5 votes
def load_embeddings(self):
        """generate embeddings suited for the current vocab or load previously cached ones."""
        embedding_file = os.path.join(self.args.output_dir, 'embedding.msgpack')
        if not os.path.exists(embedding_file):
            embeddings = load_embeddings(self.args.pretrained_embeddings, self.vocab,
                                         self.args.embedding_dim, mode=self.args.embedding_mode,
                                         lower=self.args.lower_case)
            with open(embedding_file, 'wb') as f:
                msgpack.dump(embeddings, f)
        else:
            with open(embedding_file, 'rb') as f:
                embeddings = msgpack.load(f)
        return embeddings 
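msgpack only packs basic types (dicts, lists, numbers, strings, bytes), so a cache like the one above presumably stores embeddings as a plain nested list rather than, say, a NumPy array. A sketch of converting at the cache boundary under that assumption (the file name is illustrative):

import msgpack
import numpy as np

embeddings = np.random.rand(3, 4)

# msgpack cannot pack an ndarray directly; store a nested list instead.
with open('embedding.msgpack', 'wb') as f:
    msgpack.dump(embeddings.tolist(), f)

with open('embedding.msgpack', 'rb') as f:
    restored = np.array(msgpack.load(f))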
Example #7
Source File: predict_CoQA.py    From FlowDelta with MIT License    5 votes
def load_dev_data(opt): # can be extended to true test set
    with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    assert opt['embedding_dim'] == embedding.size(1)

    with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f:
        data = msgpack.load(f, encoding='utf8')

    assert opt['num_features'] == len(data['context_features'][0][0]) + opt['explicit_dialog_ctx'] * 3

    dev = {'context': list(zip(
                        data['context_ids'],
                        data['context_tags'],
                        data['context_ents'],
                        data['context'],
                        data['context_span'],
                        data['1st_question'],
                        data['context_tokenized'])),
           'qa': list(zip(
                        data['question_CID'],
                        data['question_ids'],
                        data['context_features'],
                        data['answer_start'],
                        data['answer_end'],
                        data['rationale_start'],
                        data['rationale_end'],
                        data['answer_choice'],
                        data['question'],
                        data['answer'],
                        data['question_tokenized']))
          }

    return dev, embedding 
Example #8
Source File: train_QuAC.py    From FlowDelta with MIT License    5 votes
def load_train_data(opt):
    with open(os.path.join(args.train_dir, 'train_meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)

    with open(os.path.join(args.train_dir, 'train_data.msgpack'), 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    #data_orig = pd.read_csv(os.path.join(args.train_dir, 'train.csv'))

    opt['num_features'] = len(data['context_features'][0][0])

    train = {'context': list(zip(
                        data['context_ids'],
                        data['context_tags'],
                        data['context_ents'],
                        data['context'],
                        data['context_span'],
                        data['1st_question'],
                        data['context_tokenized'])),
             'qa': list(zip(
                        data['question_CID'],
                        data['question_ids'],
                        data['context_features'],
                        data['answer_start'],
                        data['answer_end'],
                        data['answer_choice'],
                        data['question'],
                        data['answer'],
                        data['question_tokenized']))
            }
    return train, embedding, opt 
Example #9
Source File: predict_QuAC.py    From FlowDelta with MIT License    5 votes
def load_dev_data(opt): # can be extended to true test set
    with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    assert opt['embedding_dim'] == embedding.size(1)

    with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f:
        data = msgpack.load(f, encoding='utf8')

    assert opt['num_features'] == len(data['context_features'][0][0]) + opt['explicit_dialog_ctx'] * (opt['use_dialog_act']*3 + 2)
    
    dev = {'context': list(zip(
                        data['context_ids'],
                        data['context_tags'],
                        data['context_ents'],
                        data['context'],
                        data['context_span'],
                        data['1st_question'],
                        data['context_tokenized'])),
           'qa': list(zip(
                        data['question_CID'],
                        data['question_ids'],
                        data['context_features'],
                        data['answer_start'],
                        data['answer_end'],
                        data['answer_choice'],
                        data['question'],
                        data['answer'],
                        data['question_tokenized']))
          }
    
    dev_answer = []
    for i, CID in enumerate(data['question_CID']):
        if len(dev_answer) <= CID:
            dev_answer.append([])
        dev_answer[CID].append(data['all_answer'][i])
    
    return dev, embedding, dev_answer 
Example #10
Source File: train_CoQA.py    From FlowDelta with MIT License    5 votes
def load_dev_data(opt): # can be extended to true test set
    with open(os.path.join(args.dev_dir, 'dev_meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    assert opt['embedding_dim'] == embedding.size(1)

    with open(os.path.join(args.dev_dir, 'dev_data.msgpack'), 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    #data_orig = pd.read_csv(os.path.join(args.dev_dir, 'dev.csv'))

    assert opt['num_features'] == len(data['context_features'][0][0])

    dev = {'context': list(zip(
                        data['context_ids'],
                        data['context_tags'],
                        data['context_ents'],
                        data['context'],
                        data['context_span'],
                        data['1st_question'],
                        data['context_tokenized'])),
           'qa': list(zip(
                        data['question_CID'],
                        data['question_ids'],
                        data['context_features'],
                        data['answer_start'],
                        data['answer_end'],
                        data['rationale_start'],
                        data['rationale_end'],
                        data['answer_choice'],
                        data['question'],
                        data['answer'],
                        data['question_tokenized']))
          }

    return dev, embedding 
Example #11
Source File: train_CoQA.py    From FlowDelta with MIT License    5 votes
def load_train_data(opt):
    with open(os.path.join(args.train_dir, 'train_meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)

    with open(os.path.join(args.train_dir, 'train_data.msgpack'), 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    #data_orig = pd.read_csv(os.path.join(args.train_dir, 'train.csv'))

    opt['num_features'] = len(data['context_features'][0][0])

    train = {'context': list(zip(
                        data['context_ids'],
                        data['context_tags'],
                        data['context_ents'],
                        data['context'],
                        data['context_span'],
                        data['1st_question'],
                        data['context_tokenized'])),
             'qa': list(zip(
                        data['question_CID'],
                        data['question_ids'],
                        data['context_features'],
                        data['answer_start'],
                        data['answer_end'],
                        data['rationale_start'],
                        data['rationale_end'],
                        data['answer_choice'],
                        data['question'],
                        data['answer'],
                        data['question_tokenized']))
            }
    return train, embedding, opt 
Example #12
Source File: serialization.py    From dcase_util with MIT License    5 votes
def load_marshal(cls, filename):
        """Load MARSHAL file

        Parameters
        ----------
        filename : str
            Filename path

        Returns
        -------
        data

        """

        cls.file_exists(filename=filename)

        try:
            import marshal

        except ImportError:
            message = '{name}: Unable to import marshal module.'.format(
                name=cls.__name__
            )

            cls.logger().exception(message)
            raise ImportError(message)

        return marshal.load(open(filename, "rb")) 
Example #13
Source File: serialization.py    From dcase_util with MIT License    5 votes
def load_msgpack(cls, filename):
        """Load MSGPACK file

        Parameters
        ----------
        filename : str
            Filename path

        Returns
        -------
        data

        """

        cls.file_exists(filename=filename)

        try:
            import msgpack

        except ImportError:
            message = '{name}: Unable to import msgpack module. You can install it with `pip install msgpack`.'.format(
                name=cls.__name__
            )

            cls.logger().exception(message)
            raise ImportError(message)

        return msgpack.load(open(filename, "rb"), encoding='utf-8') 
Example #14
Source File: serialization.py    From dcase_util with MIT License    5 votes
def load_cpickle(cls, filename):
        """Load CPICKLE file

        Parameters
        ----------
        filename : str
            Filename path

        Returns
        -------
        data

        """

        cls.file_exists(filename=filename)

        try:
            import cPickle as pickle

        except ImportError:
            try:
                import pickle

            except ImportError:
                message = '{name}: Unable to import pickle module.'.format(
                    name=cls.__name__
                )

                cls.logger().exception(message)
                raise ImportError(message)

        return pickle.load(open(filename, "rb")) 
Example #15
Source File: CoQAPreprocess.py    From SDNet with MIT License    5 votes
def load_data(self):
        print('Load train_meta.msgpack...')
        meta_file_name = os.path.join(self.spacyDir, 'train_meta.msgpack')
        with open(meta_file_name, 'rb') as f:
            meta = msgpack.load(f, encoding='utf8')
        embedding = torch.Tensor(meta['embedding'])
        self.opt['vocab_size'] = embedding.size(0)
        self.opt['vocab_dim'] = embedding.size(1)
        self.opt['char_vocab_size'] = len(meta['char_vocab'])
        return meta['vocab'], meta['char_vocab'], embedding 
Example #16
Source File: reader.py    From transit-python with Apache License 2.0    5 votes
def load(self, stream):
        return self.decoder.decode(msgpack.load(stream,
                                                object_pairs_hook=OrderedDict)) 
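msgpack.load() mirrors json.load() here: both accept an object_pairs_hook callable that receives the decoded key/value pairs, which transit-python uses to keep map entries in order. A minimal sketch:

from collections import OrderedDict
import msgpack

packed = msgpack.dumps({'a': 1, 'b': 2})
# Decode maps into OrderedDict instead of a plain dict.
decoded = msgpack.loads(packed, object_pairs_hook=OrderedDict)
assert isinstance(decoded, OrderedDict)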
Example #17
Source File: reader.py    From transit-python with Apache License 2.0    5 votes
def load(self, stream):
        return self.decoder.decode(json.load(stream,
                                             object_pairs_hook=OrderedDict)) 
Example #18
Source File: reader.py    From transit-python with Apache License 2.0    5 votes
def read(self, stream):
        """Given a readable file descriptor object (something `load`able by
        msgpack or json), read the data, and return the Python representation
        of the contents. One-shot reader.
        """
        return self.reader.load(stream) 
Example #19
Source File: interface.py    From simple-effective-text-matching-pytorch with Apache License 2.0    5 votes
def load_embeddings(self):
        """generate embeddings suited for the current vocab or load previously cached ones."""
        assert self.args.pretrained_embeddings
        embedding_file = os.path.join(self.args.output_dir, 'embedding.msgpack')
        if not os.path.exists(embedding_file):
            embeddings = load_embeddings(self.args.pretrained_embeddings, self.vocab,
                                         self.args.embedding_dim, mode=self.args.embedding_mode,
                                         lower=self.args.lower_case)
            with open(embedding_file, 'wb') as f:
                msgpack.dump(embeddings, f)
        else:
            with open(embedding_file, 'rb') as f:
                embeddings = msgpack.load(f)
        return embeddings 
Example #20
Source File: serialization.py    From dcase_util with MIT License    4 votes
def load_yaml(cls, filename):
        """Load YAML file

        Parameters
        ----------
        filename : str
            Filename path

        Returns
        -------
        data

        """

        cls.file_exists(filename=filename)

        try:
            import yaml

        except ImportError:
            message = '{name}: Unable to import YAML module. You can install it with `pip install pyyaml`.'.format(name=cls.__name__)
            cls.logger().exception(message)
            raise ImportError(message)

        try:
            with open(filename, 'r') as infile:
                return yaml.load(infile, Loader=yaml.FullLoader)

        except yaml.YAMLError as exc:
            cls.logger().error("Error while parsing YAML file [{file}]".format(file=filename))
            if hasattr(exc, 'problem_mark'):
                if exc.context is not None:
                    cls.logger().error(str(exc.problem_mark) + '\n  ' + str(exc.problem) + ' ' + str(exc.context))
                    cls.logger().error('  Please correct data and retry.')

                else:
                    cls.logger().error(str(exc.problem_mark) + '\n  ' + str(exc.problem))
                    cls.logger().error('  Please correct data and retry.')

            else:
                cls.logger().error("Something went wrong while parsing yaml file [{file}]".format(file=filename))

            return 
Example #21
Source File: utils.py    From libnacl with Apache License 2.0    4 votes
def load_key(path_or_file, serial='json'):
    '''
    Read in a key from a file and return the applicable key object based on
    the contents of the file
    '''
    if hasattr(path_or_file, 'read'):
        stream = path_or_file
    else:
        if serial == 'json':
            stream = open(path_or_file, 'r')
        else:
            stream = open(path_or_file, 'rb')

    try:
        if serial == 'msgpack':
            import msgpack
            key_data = msgpack.load(stream)
        elif serial == 'json':
            import json
            if sys.version_info[0] >= 3:
                key_data = json.loads(stream.read())
            else:
                key_data = json.loads(stream.read(), encoding='UTF-8')
    finally:
        if stream != path_or_file:
            stream.close()

    if 'priv' in key_data and 'sign' in key_data and 'pub' in key_data:
        return libnacl.dual.DualSecret(
                libnacl.encode.hex_decode(key_data['priv']),
                libnacl.encode.hex_decode(key_data['sign']))
    elif 'priv' in key_data and 'pub' in key_data:
        return libnacl.public.SecretKey(
                libnacl.encode.hex_decode(key_data['priv']))
    elif 'sign' in key_data:
        return libnacl.sign.Signer(
                libnacl.encode.hex_decode(key_data['sign']))
    elif 'pub' in key_data:
        return libnacl.public.PublicKey(
                libnacl.encode.hex_decode(key_data['pub']))
    elif 'verify' in key_data:
        return libnacl.sign.Verifier(key_data['verify'])
    elif 'priv' in key_data:
        return libnacl.secret.SecretBox(
                libnacl.encode.hex_decode(key_data['priv']))
    raise ValueError('Found no key data') 
Example #22
Source File: predict_QuAC.py    From FlowDelta with MIT License    4 votes
def main():
    log.info('[program starts.]')
    checkpoint = torch.load(args.model)
    opt = checkpoint['config']
    opt['task_name'] = 'QuAC'
    opt['cuda'] = args.cuda
    opt['seed'] = args.seed
    if opt.get('disperse_flow') is None:
        opt['disperse_flow'] = False
    if opt.get('rationale_lambda') is None:
        opt['rationale_lambda'] = 0.0
    if opt.get('no_dialog_flow') is None:
        opt['no_dialog_flow'] = False
    if opt.get('do_hierarchical_query') is None:
        opt['do_hierarchical_query'] = False
    state_dict = checkpoint['state_dict']
    log.info('[model loaded.]')

    test, test_embedding, test_answer = load_dev_data(opt)
    model = QAModel(opt, state_dict = state_dict)
    log.info('[Data loaded.]')

    model.setup_eval_embed(test_embedding)

    if args.cuda:
        model.cuda()

    batches = BatchGen_QuAC(test, batch_size=args.batch_size, evaluation=True, gpu=args.cuda, dialog_ctx=opt['explicit_dialog_ctx'], use_dialog_act=opt['use_dialog_act'], precompute_elmo=opt['elmo_batch_size'] // args.batch_size)
    sample_idx = random.sample(range(len(batches)), args.show)

    predictions = []
    no_ans_scores = []
    for i, batch in enumerate(batches):
        prediction, noans = model.predict(batch, No_Ans_Threshold=args.no_ans)
        predictions.extend(prediction)
        no_ans_scores.extend(noans)

        if not (i in sample_idx):
            continue
        
        print("Context: ", batch[-4][0])
        for j in range(len(batch[-2][0])):
            print("Q: ", batch[-2][0][j])
            print("A: ", prediction[0][j])
            print("     True A: ", batch[-1][0][j], "| Follow up" if batch[-6][0][j].item() // 10 else "| Don't follow up")
            print("     Val. A: ", test_answer[args.batch_size * i][j])
        print("")


    pred_out = {'predictions': predictions, 'no_ans_scores': no_ans_scores}
    with open(args.output, 'wb') as f:
        pickle.dump(pred_out, f)

    f1, h_f1, HEQ_Q, HEQ_D = score(predictions, test_answer, min_F1=args.min_f1)
    log.warning("Test F1: {:.2f}, HEQ_Q: {:.2f}, HEQ_D: {:.2f}".format(f1, HEQ_Q, HEQ_D)) 
Example #23
Source File: coverage.py    From bncov with MIT License    4 votes
def load_from_file(self, filename):
        """Reconstruct a CoverageDB using the current BinaryView and a CoverageDB saved to disk using .save_to_file()"""
        if file_backing_disabled:
            raise Exception("[!] Can't save/load coverage db files without msgpack. Try `pip install msgpack`")
        self.filename = filename
        with open(filename, "rb") as f:
            loaded_dict = msgpack.load(f, raw=False)
        if "version" not in loaded_dict:
            self._old_load_from_file(loaded_dict)
        # Do sanity checks
        loaded_version = int(loaded_dict["version"])
        if loaded_version != 1:
            raise Exception("[!] Unsupported version number: %d" % loaded_version)

        loaded_module_name = loaded_dict["module_name"]
        if loaded_module_name != self.module_name:
            raise Exception("[!] ERROR: Module name from covdb (%s) doesn't match BinaryView (%s)" %
                            (loaded_module_name, self.module_name))

        loaded_module_base = loaded_dict["module_base"]
        if loaded_module_base != self.module_base:
            raise Exception("[!] ERROR: Module base from covdb (0x%x) doesn't match BinaryView (0x%x)" %
                            (loaded_module_base, self.module_base))

        # Parse the saved members
        coverage_files = loaded_dict["coverage_files"]
        self.coverage_files = coverage_files

        block_dict = dict()
        loaded_block_dict = loaded_dict["block_dict"]
        file_index_map = {self.coverage_files.index(filepath): filepath for filepath in self.coverage_files}
        for block, trace_id_list in loaded_block_dict.items():
            trace_list = [file_index_map[i] for i in trace_id_list]
            block_dict[block] = trace_list
        self.block_dict = block_dict

        # Regen other members from saved members
        bv = self.bv
        self.module_blocks = {bb.start: bb.length for bb in bv.basic_blocks}
        trace_dict = {}
        for block, trace_list in block_dict.items():
            for name in trace_list:
                trace_dict.setdefault(name, set()).add(block)
        self.trace_dict = trace_dict
        self.total_coverage = set(block_dict.keys())

        # Other members are blank/empty
        self.function_stats = {}
        self.frontier = set()
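The explicit raw=False above is what keeps the loaded keys as str, so lookups like loaded_dict["version"] work. A small sketch of the difference (raw=False has been the default since msgpack 1.0, so passing it explicitly mainly pins the behaviour on older releases):

import msgpack

packed = msgpack.dumps({'module_name': 'demo'})
# raw=False -> str keys/values; raw=True -> bytes keys/values.
assert msgpack.loads(packed, raw=False) == {'module_name': 'demo'}
assert msgpack.loads(packed, raw=True) == {b'module_name': b'demo'}

One more version caveat: msgpack 1.0 also defaults to strict_map_key=True, which rejects non-str/bytes map keys, so unpacking a block dict keyed by integer addresses like the one above would additionally need strict_map_key=False on that version.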