Python pandas.read_msgpack() Examples
The following are 11 code examples of pandas.read_msgpack().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module pandas, or try the search function.
Example #1
Source File: columnarStructure.py From mmtf-pyspark with Apache License 2.0 | 6 votes |
def get_chain_to_entity_index(self):
    '''Returns an array that maps a chain index to an entity index

    Returns
    -------
    :obj:`array <numpy.ndarray>`
       index that maps chain index to an entity index
    '''
    if self.entityChainIndex is None:
        # Lazily build the mapping once and cache it on the instance.
        self.entityChainIndex = np.empty(self.structure.num_chains, dtype=np.int32)
        for i, entity in enumerate(self.structure.entity_list):
            chainIndexList = entity['chainIndexList']
            # pd.read_msgpack returns a tuple, msgpack-python returns a list.
            # NumPy would treat a tuple as a multi-dimensional index, so
            # normalize to a list before fancy-indexing.
            if not isinstance(chainIndexList, list):
                chainIndexList = list(chainIndexList)
            self.entityChainIndex[chainIndexList] = i
    return self.entityChainIndex
Example #2
Source File: mmtfStructure.py From mmtf-pyspark with Apache License 2.0 | 6 votes |
def chain_to_entity_index(self):
    '''Builds and caches an array that maps a chain index to an entity index.

    The mapping is stored in ``self.entityChainIndex``; nothing is returned.
    Chain indices at or beyond ``self.num_chains`` (possible after the
    structure has been truncated) are skipped.
    '''
    if self.entityChainIndex is None:
        self.entityChainIndex = np.empty(self.num_chains, dtype=np.int32)
        # TODO: entity_list may need updating when self.truncate is set;
        # until then out-of-range chain indices are simply ignored below.
        for i, entity in enumerate(self.entity_list):
            for index in entity['chainIndexList']:
                if index < self.num_chains:
                    self.entityChainIndex[index] = i
Example #3
Source File: mmtfReader.py From mmtf-pyspark with Apache License 2.0 | 6 votes |
def _call_mmtf(f, first_model=False):
    '''Call function for mmtf files'''
    # The entry name is the file's base name, upper-cased.
    name = f.split('/')[-1].split('.')[0].upper()
    if ".mmtf.gz" in f:
        # Gzipped file: hand an open binary stream to the msgpack reader.
        stream = gzip.open(f, 'rb')
        structure = MmtfStructure(pd.read_msgpack(stream), first_model)
        return (name, structure)
    if ".mmtf" in f:
        # Plain file: pandas reads the path directly.
        structure = MmtfStructure(pd.read_msgpack(f), first_model)
        return (name, structure)
Example #4
Source File: cache.py From catalyst with Apache License 2.0 | 6 votes |
def __init__(self, path=None, lock=None, clean_on_failure=True, serialization='msgpack'):
    """Set up the cache directory, locking, and (de)serialization hooks.

    ``serialization`` is either ``'msgpack'`` or ``'pickle'`` with an
    optional protocol suffix, e.g. ``'pickle:4'``.
    """
    if path is None:
        path = mkdtemp()
    self.path = path
    if lock is None:
        lock = nop_context
    self.lock = lock
    self.clean_on_failure = clean_on_failure

    if serialization == 'msgpack':
        self._protocol = None
        self.serialize = pd.DataFrame.to_msgpack
        self.deserialize = pd.read_msgpack
    else:
        parts = serialization.split(':', 1)
        if parts[0] != 'pickle':
            raise ValueError(
                "'serialization' must be either 'msgpack' or 'pickle[:n]'",
            )
        self._protocol = None if len(parts) == 1 else int(parts[1])
        self.serialize = self._serialize_pickle
        self.deserialize = pickle.load

    ensure_directory(self.path)
Example #5
Source File: message.py From timeflux with MIT License | 6 votes |
def msgpack_deserialize(message):
    """Decode a two-element message into ``[topic, DataFrame]``."""
    # TODO: handle meta and cases where data is None
    topic, payload = message[0], message[1]
    return [topic.decode("utf-8"), pd.read_msgpack(payload)]

# def arrow_serialize(message):
#     topic = message[0].decode('utf-8')
#     df = message[1]
#     return [topic, pa.serialize(df).to_buffer()]

# def arrow_deserialize(message):
#     topic = message[0]
#     data = message[1]
#     return [topic, pa.deserialize(data)]
Example #6
Source File: cache.py From git-pandas with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get(self, orik):
    """Return the cached DataFrame stored under ``orik``.

    Raises ``CacheMissException`` — after dropping any stale entry from
    the key list — when the key is absent from the backing cache.
    """
    key = self.prefix + orik
    if not self.exists(orik):
        # Keep the bookkeeping list consistent with the backing store.
        try:
            self._key_list.remove(key)
        except ValueError:
            pass
        raise CacheMissException(key)
    return pd.read_msgpack(self._cache.get(key))
Example #7
Source File: run_pandas.py From recipy with Apache License 2.0 | 5 votes |
def read_msgpack(self):
    """Load dataframe.mpack from the data directory with pandas.read_msgpack."""
    mpack_path = os.path.join(self.data_dir, "dataframe.mpack")
    pd.read_msgpack(mpack_path)
Example #8
Source File: mmtfReader.py From mmtf-pyspark with Apache License 2.0 | 5 votes |
def _get_structure(pdbId, reduced, first_model):
    '''Download and decode a list of structure from a list of PDBid

    Parameters
    ----------
    pdbID : list
       List of structures to download

    Returns
    -------
    tuple
       pdbID and decoder
    '''
    try:
        #unpack = default_api.get_raw_data_from_url(pdbId, reduced)
        url = default_api.get_url(pdbId, reduced)
        # NOTE(review): `urllib2` here presumably aliases urllib.request
        # (imported elsewhere in the file) since py3 features are used below
        # — confirm against the file's imports.
        request = urllib2.Request(url)
        # Ask the server for a gzipped payload to cut transfer size.
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.urlopen(request)
        # Only decompress when the server actually honored the request.
        if response.info().get('Content-Encoding') == 'gzip':
            data = gzip.decompress(response.read())
        else:
            data = response.read()
        # NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and
        # removed in 1.0 — verify the pinned pandas version.
        unpack = pd.read_msgpack(data)
        decoder = MmtfStructure(unpack, first_model)
        return (pdbId, decoder)
    except urllib.error.HTTPError:
        # Unknown/invalid PDB id: report it and fall through, returning None.
        print(f"ERROR: {pdbId} is not a valid pdbId")
Example #9
Source File: mmtfReader.py From mmtf-pyspark with Apache License 2.0 | 5 votes |
def _call_sequence_file(t, first_model):
    '''Call function for hadoop sequence files'''
    # TODO: check if all sequence files are gzipped
    # data = default_api.ungzip_data(t[1])
    # unpack = msgpack.unpackb(data.read(), raw=False)
    # decoder = MmtfStructure(unpack)
    # return (str(t[0]), decoder)
    key, raw = t[0], t[1]
    payload = gzip.decompress(raw)
    structure = MmtfStructure(pd.read_msgpack(payload), first_model)
    return (key, structure)
Example #10
Source File: dataframe_bytes_storage.py From pyABC with BSD 3-Clause "New" or "Revised" License | 5 votes |
def df_from_bytes_msgpack_(bytes_: bytes) -> pd.DataFrame:
    """Deserialize msgpack bytes into a DataFrame, validating the result."""
    try:
        loaded = pd.read_msgpack(BytesIO(bytes_))
    except UnicodeDecodeError:
        raise DataFrameLoadException("Not a DataFrame")
    if isinstance(loaded, pd.DataFrame):
        return loaded
    raise DataFrameLoadException("Not a DataFrame")
Example #11
Source File: stock_resampler.py From QUANTAXIS_RealtimeCollector with MIT License | 4 votes |
def on_message_callback(self, channel, method, properties, body): context = pd.read_msgpack(body) # merge update if self.market_data is None: # self.market_data = context pass else: logger.info("Before market_data, concat and update start, 合并市场数据") cur_time = datetime.datetime.now() self.market_data.update(context) end_time = datetime.datetime.now() cost_time = (end_time - cur_time).total_seconds() logger.info("Before market_data, concat and update end, 合并市场数据, 耗时,cost: %s s" % cost_time) logger.info(self.market_data.to_csv(float_format='%.3f')) filename = get_file_name_by_date('stock.market.%s.csv', self.log_dir) # 不追加,复写 logging_csv(self.market_data, filename, index=True, mode='w') # group by code and resample try: cur_time = datetime.datetime.now() bar_data: pd.DataFrame = tdx_stock_bar_resample_parallel( self.market_data[self.market_data.close > 0], self.frequency, jobs=self.cpu_count ) end_time = datetime.datetime.now() cost_time = (end_time - cur_time).total_seconds() logger.info("数据重采样耗时,cost: %s" % cost_time) logger.info("发送重采样数据中start") self.publish_msg(bar_data.to_msgpack()) logger.info("发送重采样数据完毕end") logger.info(bar_data.to_csv(float_format='%.3f')) filename = get_file_name_by_date('stock.bar.%s.csv', self.log_dir) # 不追加,复写 logging_csv(bar_data, filename, index=True, mode='w') del bar_data except Exception as e: logger.error("failure股票重采样数据. " + e.__str__()) finally: logger.info("重采样计数 count : %s" % self.count) self.count += 1 del context