Python Examples of ijson.items

Source File: migrate_json.py From examples with Apache License 2.0

7 votes

def load_data_into_grakn(input, session):
    '''
      Inserts every record of a JSON data file into Grakn, using one write
      transaction per record.

      For each parsed record the function opens a write transaction on
      `session`, builds the Graql insert query with `input["template"]`,
      prints it, runs it and commits. A summary line is printed at the end.

      :param input as dictionary: must provide "file" (data file path without
        the ".json" suffix) and "template" (callable turning a record into a
        Graql insert query string)
      :param session: object whose `transaction().write()` yields the
        transaction used for each record
    '''
    records = parse_data_to_dictionaries(input)

    for record in records:
        with session.transaction().write() as tx:
            query = input["template"](record)
            print("Executing Graql Query: " + query)
            tx.query(query)
            tx.commit()

    summary = "\nInserted " + str(len(records))
    summary = summary + " items from [ " + input["file"] + ".json] into Grakn.\n"
    print(summary)

Source File: query.py From mixpanel-jql with MIT License

6 votes

def _validate_people_params(self, params):
        if not params:
            return "{}"
        if not isinstance(params, dict):
            raise JQLSyntaxError("people_params must be a dict")
        for k, v in params.items():
            if k != 'user_selectors':
                raise JQLSyntaxError('"%s" is not a valid key in people_params' % k)
            if not isinstance(v, collections.Iterable):
                raise JQLSyntaxError("people_params['user_selectors'] must be iterable")
            for i, e in enumerate(v):
                for ek, ev in e.items():
                    if ek not in ('selector',):
                        raise JQLSyntaxError(
                            "'%s' is not a valid key in "
                            "people_params['user_selectors'][%s]" % (ek, i))
                    elif not isinstance(ev, six.string_types):
                        raise JQLSyntaxError(
                                "people_params['user_selectors'][%s].%s "
                                "must be a string" % (i, ek))
        return json.dumps(params)

Source File: client.py From anchore-engine with Apache License 2.0

6 votes

def _extract_response_data(self, response_text):
        """Pull the paging token and the record count out of a raw response.

        The payload is parsed twice from an in-memory buffer: once for the
        value(s) at FEED_DATA_NEXT_TOKEN_PATH and once to count the entries at
        FEED_DATA_ITEMS_PATH.

        :param response_text: raw bytes of the response body
        :return: tuple of (next_token or None, the unmodified response_text,
            number of records found)
        """
        buffer = BytesIO(response_text)

        # First pass: the token is only accepted when exactly one value matched.
        tokens = list(ijson.items(buffer, FEED_DATA_NEXT_TOKEN_PATH))
        next_token = tokens[0] if len(tokens) == 1 else None

        # Collapse empty strings (and any other falsy token) to an explicit None.
        next_token = next_token or None

        # Second pass: rewind and count the records. The records are only
        # counted here, never returned, so no decimal-aware parsing is needed.
        buffer.seek(0)
        record_count = sum(1 for _ in ijson.items(buffer, FEED_DATA_ITEMS_PATH))

        logger.debug('Found {} records in data chunk'.format(record_count))
        buffer.close()

        return next_token, response_text, record_count

Source File: json.py From tabulator-py with MIT License

6 votes

def __iter_extended_rows(self):
        """Yield ``(row_number, keys, values)`` for each item in the JSON stream.

        Items are read from ``self.__bytes`` at the path ``item``, or at
        ``<property>.item`` when ``self.__property`` is set. Row numbers start
        at 1. Dict items yield their keys in sorted order with matching values;
        list/tuple items yield ``None`` for the keys. Any other item raises
        ``exceptions.SourceError`` unless ``self.__force_parse`` is truthy, in
        which case an empty row is yielded instead.
        """
        if self.__property is None:
            path = 'item'
        else:
            path = '%s.item' % self.__property

        for row_number, item in enumerate(ijson.items(self.__bytes, path), start=1):
            if isinstance(item, dict):
                keys = sorted(item)
                yield (row_number, keys, [item[key] for key in keys])
                continue

            if isinstance(item, (tuple, list)):
                yield (row_number, None, list(item))
                continue

            # Neither a mapping nor a sequence: fail unless parsing is forced.
            if not self.__force_parse:
                message = 'JSON item has to be list or dict'
                raise exceptions.SourceError(message)
            yield (row_number, None, [])

Source File: metrics.py From dstc-noesis with MIT License

6 votes

def rank(src, tgt):
    """
    Compute, for every test case in ``tgt``, the 1-based position of the first
    prediction that appears among that case's targets.

    Args:
        src (dict): predictions by the model, keyed like ``tgt``
        tgt (dict): ground truth/ targets

    Returns:
         ranks (list): one rank per entry of ``tgt`` in iteration order; 0 when
         no prediction matches or when ``src`` has no entry for the case (a
         warning is logged in the latter situation)
    """
    ranks = []
    for idx, target in tgt.items():
        position = 0
        try:
            for i, entry in enumerate(src[idx], start=1):
                if entry in target:
                    position = i
                    break
        except KeyError:
            logging.warning(
                "No matching entry found for test case with dialog-id {}".format(idx))
        ranks.append(position)

    return ranks

Source File: language_modeling.py From prenlp with Apache License 2.0

6 votes

def _get_data(self) -> list:
        """Return the dataset, building and caching it on first use.

        When the processed file (``self.root/self.out_filename``) already
        exists it is loaded and returned. Otherwise the raw JSON file at
        ``self.root/self.dirname`` is streamed, every item's ``'text'`` is
        normalized and split on newlines, the non-empty lines are collected,
        the raw file is deleted and the result is saved to the processed path.
        """
        cache_path = self.root/self.out_filename
        if cache_path.exists():
            return load_language_modeling(cache_path)

        raw_path = self.root/self.dirname
        dataset = []
        with open(raw_path, 'r', encoding='utf-8') as jfile:
            for item in tqdm(ijson.items(jfile, 'item')):
                text = self._normalize(item['text']).strip()
                # One sample per non-empty line of the document. To keep whole
                # documents as samples, join the lines back with '\n' and
                # append the single joined string instead.
                dataset.extend(line for line in text.split('\n') if len(line) > 0)

        # The raw file is removed once it has been fully consumed, then the
        # processed dataset is persisted.
        raw_path.unlink()
        save_language_modeling(dataset, to_path=cache_path)

        return dataset

Source File: query.py From mixpanel-jql with MIT License

6 votes

def _decode(entity):
    """
    Recursively turn unicode text into UTF-8 byte strings so the `u` prefix
    does not show up inside nested data structures.

    On Python 3 the entity is returned untouched; the conversion is only
    performed on Python 2. Tuples, lists and dicts (keys and values) are
    walked recursively; anything else is returned as is.

    :param entity: The entity to decode.
    :return: The entity with all text values encoded.
    """
    if six.PY3:
        # Nothing to do on Python 3.
        return entity

    if isinstance(entity, six.text_type):
        return entity.encode('utf8')
    if isinstance(entity, dict):
        return dict((_decode(key), _decode(value)) for key, value in entity.items())
    if isinstance(entity, list):
        return [_decode(member) for member in entity]
    if isinstance(entity, tuple):
        return tuple([_decode(member) for member in entity])
    return entity

Source File: convert_dstc8_data.py From NOESIS-II with Apache License 2.0

6 votes

def create_test_answers_file(test_file, test_answers_file):
    """Collect every candidate answer of a test set and dump them to a file.

    Each option under ``'options-for-next'`` of every entry in ``test_file``
    contributes ``candidate-id -> utterance + " __eou__ "``. A ``"NONE"``
    entry is always added. The mapping is written to ``test_answers_file`` as
    ``<id>\\t<answer>`` lines (newlines inside answers are removed).

    :param test_file: path of the JSON test file
    :param test_answers_file: path of the file to write
    :return: dict mapping candidate id to answer text
    """
    answers = {}

    with open(test_file, 'rb') as f:
        for entry in ijson.items(f, 'item'):
            for option in entry['options-for-next']:
                answers[option['candidate-id']] = option['utterance'] + " __eou__ "

    answers["NONE"] = "None __eou__ "

    with open(test_answers_file, "w") as vocabfile:
        for answer_id, answer in answers.items():
            line = "\t".join([str(answer_id), answer.replace("\n", "")])
            vocabfile.write(line + "\n")
    print("Saved test answers to {}".format(test_answers_file))

    return answers

Source File: parsers.py From riko with MIT License

6 votes

def etree2dict(element):
    """Recursively convert an element tree into a dict, imitating the way
    Yahoo Pipes does it.

    The element's attributes seed the dict, its text is merged in through
    `_make_content`, and every child is converted recursively and merged in
    under its tag. An element that has text and no key other than 'content'
    collapses to that content value.
    """
    result = dict(element.items())
    result.update(_make_content(result, element.text, strip=True))

    for child in element:
        tag = child.tag
        converted = etree2dict(child)
        result.update(_make_content(result, converted, tag))

    if element.text and set(result) <= {'content'}:
        # leaf node without attributes: return the bare content
        return result.get('content')

    return result

Source File: geo_heatmap.py From geo-heatmap with MIT License

5 votes

def generateMap(self, settings):
        """Build a folium map with a heat-map layer from the stored coordinates.

        Arguments:
            settings {dict} -- Must contain "tiles", "zoom_start", "radius",
                "blur", "min_opacity" and "max_zoom".

        Returns:
            Map -- The folium map with the heat-map layer added.
        """
        # Read every setting up front so a missing key fails before any work.
        option_names = ("tiles", "zoom_start", "radius",
                        "blur", "min_opacity", "max_zoom")
        tiles, zoom_start, radius, blur, min_opacity, max_zoom = [
            settings[name] for name in option_names]

        # One (first coord, second coord, magnitude) triple per stored point.
        points = []
        for coords, magnitude in self.coordinates.items():
            points.append((coords[0], coords[1], magnitude))

        base_map = folium.Map(location=self.max_coordinates,
                              zoom_start=zoom_start,
                              tiles=tiles)

        layer = HeatMap(points,
                        max_val=self.max_magnitude,
                        min_opacity=min_opacity,
                        radius=radius,
                        blur=blur,
                        max_zoom=max_zoom)

        base_map.add_child(layer)
        return base_map

Source File: SwiftKitten.py From SwiftKitten with MIT License

5 votes

def _get_structure_info(self, view):
        """Run the structure-info command for ``view`` and return its parsed output.

        The whole buffer text of the view is handed to
        ``get_structure_info_cmd`` to build a shell command. The command's
        combined stdout/stderr is parsed as JSON and the first top-level value
        is returned.

        Raises IndexError when the command produces no JSON value.
        """
        # get structure info command
        text = view.substr(Region(0, view.size()))
        cmd = self.get_structure_info_cmd(view, text)
        # NOTE(review): the timeout setting is read but not applied to the
        # subprocess below -- confirm whether it should bound the run time.
        timeout = self.get_settings(view, "sourcekitten_timeout", 1.0)

        # run structure info command; the context manager closes the pipe and
        # waits for the child, so no zombie process or open descriptor is left
        # behind (also when parsing fails).
        with Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) as p:
            structure_info = list(ijson.items(p.stdout, ''))[0]

        return structure_info

Source File: io.py From meza with MIT License

5 votes

def read_json(filepath, mode='r', path='item', newline=False):
    """Reads a json file (both regular and newline-delimited)

    Args:
        filepath (str): The json file path or file like object.
        mode (Optional[str]): The file open mode (default: 'r').
        path (Optional[str]): Path to the content you wish to read
            (default: 'item', i.e., the root list). Note: `path` must refer to
            a list. Ignored when `newline` is True.

        newline (Optional[bool]): Interpret file as newline-delimited, i.e.,
            parse every line as its own json document (default: False).

    Returns:
        Iterable: The parsed records

    See also:
        `meza.io.read_any`

    Examples:
        >>> filepath = p.join(DATA_DIR, 'test.json')
        >>> records = read_json(filepath)
        >>> next(records) == {
        ...     'text': 'Chicago Reader',
        ...     'float': 1,
        ...     'datetime': '1971-01-01T04:14:00',
        ...     'boolean': True,
        ...     'time': '04:14:00',
        ...     'date': '1971-01-01',
        ...     'integer': 40}
        True
    """
    def reader(f, **kw):
        # Extra keyword arguments are accepted but not used.
        if newline:
            return map(json.loads, f)
        return items(f, path)

    return read_any(filepath, reader, mode)

Source File: io.py From meza with MIT License

5 votes

def _remove_bom_from_dict(row, bom):
    """Remove a byte order marker (BOM) from a dict"""
    for k, v in row.items():
        try:
            if all([k, v, bom in k, bom in v]):
                yield (k.lstrip(bom), v.lstrip(bom))
            elif v and bom in v:
                yield (k, v.lstrip(bom))
            elif k and bom in k:
                yield (k.lstrip(bom), v)
            else:
                yield (k, v)
        except TypeError:
            yield (k, v)

Source File: update_dtp.py From dtp-stat with GNU General Public License v2.0

5 votes

def main():
    """Stream every item of data/dtp.json and pass it to get_crashes_data."""
    with open("data/dtp.json", 'r') as source:
        rows = ijson.items(source, 'item')
        for crash_row in tqdm(rows):
            get_crashes_data(crash_row)

Source File: compare_messages.py From zulip with Apache License 2.0

5 votes

def handle(self, *args: Any, **options: Any) -> None:
        """Compare two message dumps pairwise and report changed contents.

        The files named by options['dump1'] and options['dump2'] are streamed
        in lockstep. A pair with different ids aborts the comparison with a
        message on stderr; a pair with different content is reported on
        stdout. Totals are written at the end.
        """
        total_count = changed_count = 0
        with open(options['dump1']) as dump1, open(options['dump2']) as dump2:
            pairs = zip(ijson.items(dump1, 'item'), ijson.items(dump2, 'item'))
            for m1, m2 in pairs:
                total_count += 1
                if m1['id'] != m2['id']:
                    # The dumps are not aligned; stop comparing.
                    self.stderr.write('Inconsistent messages dump')
                    break
                if m1['content'] == m2['content']:
                    continue
                changed_count += 1
                self.stdout.write(f'Changed message id: {m1["id"]}')
        self.stdout.write(f'Total messages: {total_count}')
        self.stdout.write(f'Changed messages: {changed_count}')

Source File: prepare_data.py From dstc-noesis with MIT License

5 votes

def create_dialog_iter(filename):
    """
    Lazily yields one processed row per top-level item of a JSON file.
    The file stays open while the generator is being consumed.
    :param filename: path of the JSON file to stream
    :return: generator of the values returned by process_dialog
    """
    with open(filename, 'rb') as f:
        for entry in ijson.items(f, 'item'):
            yield process_dialog(entry)

Source File: utils.py From dstc-noesis with MIT License

5 votes

def read_json(input_file):
    """Return all top-level items of the JSON array in input_file as a list."""
    return list(ijson.items(input_file, 'item'))

Source File: metrics.py From dstc-noesis with MIT License

5 votes

def calculate_MAP(src, tgt):
    """
    The function calculates Mean Average Precision (MAP) and logs it.

    For every test case in ``tgt`` the precision of each target found among
    the predictions is ``(target position + 1) / (prediction position + 1)``;
    targets missing from the predictions are logged and contribute nothing.
    The per-case average divides by the number of targets, and the final
    score divides by the number of test cases in ``tgt`` (cases without
    predictions are logged and count as 0).

    Args:
        src (dict): predictions by the model
        tgt (dict): ground truth/ targets

    Returns:
        float: the MAP score; 0.0 when ``tgt`` is empty
    """
    avg_precision = []
    for idx, targets in tgt.items():
        try:
            predictions = src[idx]
        except KeyError:
            msg = "No matching entry found for test case with dialog-id {}".format(idx)
            logging.warning(msg)
            continue

        precision = []
        for i, target in enumerate(targets):
            try:
                precision.append((i + 1) / (predictions.index(target) + 1))
            except ValueError:
                msg = "Answer: {} isn't part of the predictions by the model.".format(target)
                logging.warning(msg)

        # A case without any target scores 0 instead of dividing by zero.
        if len(targets):
            avg_precision.append(sum(precision) / len(targets))
        else:
            avg_precision.append(0.0)

    # An empty ground truth has no defined mean; report 0.0 instead of
    # raising ZeroDivisionError.
    mean_avg_precision = sum(avg_precision) / len(tgt) if tgt else 0.0
    msg = "Mean Average Precision (MAP): {}".format(mean_avg_precision)
    logging.info(msg)
    return mean_avg_precision

Source File: exchangerate.py From riko with MIT License

5 votes

def parse_response(json):
    """Convert the 'rates' mapping of a parsed response into Decimals.

    Entries with a falsy rate are dropped. When the response has no 'rates'
    key, two warnings are logged and an empty dict is returned.
    """
    if 'rates' not in json:
        logger.warning('invalid json response:')
        logger.warning(json)
        return {}

    resp = {}
    for code, value in json['rates'].items():
        if value:
            resp[code] = Decimal(value)
    return resp

Source File: metrics.py From dstc-noesis with MIT License

5 votes

def read_predictions(filename):
    """Load model predictions from a JSON file.

    Returns an OrderedDict mapping each item's 'example-id' to the list of
    'candidate-id' values found in its 'candidate-ranking', in file order.
    """
    predictions = OrderedDict()
    with open(filename, 'r') as fp:
        for item in ijson.items(fp, 'item'):
            ranked_ids = []
            for candidate in item['candidate-ranking']:
                ranked_ids.append(candidate['candidate-id'])
            predictions[item['example-id']] = ranked_ids
    return predictions

Source File: parsers.py From riko with MIT License

5 votes

def any2dict(f, ext='xml', html5=False, path=None):
    """Parse a json, xml or html file-like object and return the content
    found at `path`.

    For xml/html the dotted `path` is turned into a '/'-separated expression
    and the first match is converted with `etree2dict`; an empty path selects
    the root. For json the first value found at `path` is returned.

    Raises:
        TypeError: if `ext` is not one of 'xml', 'html' or 'json'.
    """
    path = path or ''

    if ext == 'json':
        return next(items(f, path))

    if ext not in {'xml', 'html'}:
        raise TypeError("Invalid file type: '%s'" % ext)

    is_xml = ext == 'xml'
    root = xml2etree(f, is_xml, html5).getroot()
    replaced = path.replace('.', '/')
    if replaced:
        tree = next(xpath(root, replaced))
    else:
        tree = root
    return etree2dict(tree)

Source File: geo_heatmap.py From geo-heatmap with MIT License

5 votes

def streamJSONData(self, json_file, date_range):
        """Stream location records from a JSON file and accumulate coordinates.

        Records under "locations" that carry both "latitudeE7" and
        "longitudeE7" are converted to degrees (divided by 1e7, rounded to 6
        decimals) and passed to updateCoord when their "timestampMs" falls
        inside date_range. A progress bar is shown while streaming.

        Arguments:
            json_file {file} -- An open, seekable file-like object with
                JSON-encoded location data.
            date_range {tuple} -- A tuple containing the min-date and max-date,
                e.g.: (None, None) or (None, '2019-01-01').
        """
        # Rough guess of the record count: total line count divided by 13.
        line_count = sum(1 for _ in json_file)
        max_value_est = line_count / 13
        json_file.seek(0)

        locations = ijson.items(json_file, "locations.item")
        widgets = [Bar(), Percentage(), " ", ETA()]
        with ProgressBar(max_value=max_value_est, widgets=widgets) as pb:
            for i, loc in enumerate(locations):
                has_position = "latitudeE7" in loc and "longitudeE7" in loc
                if not has_position:
                    continue

                latitude = round(loc["latitudeE7"] / 1e7, 6)
                longitude = round(loc["longitudeE7"] / 1e7, 6)

                if timestampInRange(loc["timestampMs"], date_range):
                    self.updateCoord((latitude, longitude))

                # Grow the bar's maximum when the estimate turns out too low.
                if i > max_value_est:
                    max_value_est = i
                    pb.max_value = i
                pb.update(i)

Source File: query.py From mixpanel-jql with MIT License

5 votes

def send(self):
        """POST the query script to the endpoint and lazily yield each row of
        the streamed JSON response.

        The response is closed when the generator finishes. An HTTP error
        status raises through ``raise_for_status``.
        """
        response = requests.post(
            self.ENDPOINT % self.VERSION,
            auth=HTTPBasicAuth(self.api_secret, ''),
            data={'script': str(self)},
            stream=True)
        with closing(response) as resp:
            resp.raise_for_status()
            rows = ijson.items(RequestsStreamWrapper(resp), 'item')
            for row in rows:
                yield row

Source File: query.py From mixpanel-jql with MIT License

5 votes

def _validate_join_params(self, params):
        if not params:
            return "{}"
        if not isinstance(params, dict):
            raise JQLSyntaxError("join_params must be a dict")
        for k, v in params.items():
            if k == 'type':
                if v not in self.VALID_JOIN_TYPES:
                    raise JQLSyntaxError(
                        '"%s" is not a valid join type (valid types: %s)'
                        % (v, ', '.join(self.VALID_JOIN_TYPES))
                    )
            elif k == 'selectors':
                if not isinstance(v, collections.Iterable):
                    raise JQLSyntaxError("join_params['selectors'] must be iterable")
                for i, e in enumerate(v):
                    if not isinstance(e, dict):
                        raise JQLSyntaxError("join_params['selectors'][x] must be a dict")
                    for ek, ev in e.items():
                        if ek not in ('event', 'selector'):
                            raise JQLSyntaxError(
                                "'%s' is not a valid key in "
                                "join_params['selectors'][%s]" % (ek, i))
                        elif not isinstance(ev, six.string_types):
                            raise JQLSyntaxError(
                                "join_params['selectors'][%s].%s "
                                "must be a string" % (i, ek))
            else:
                raise JQLSyntaxError('"%s" is not a valid key in join_params' % k)
        return json.dumps(params)

Source File: query.py From mixpanel-jql with MIT License

5 votes

def _validate_event_params(self, params):
        if not params:
            return "{}"
        if not isinstance(params, dict):
            raise JQLSyntaxError("event_params must be a dict")
        params = dict(params)
        for k, v in params.items():
            if k in ('to_date', 'from_date'):
                if isinstance(v, (datetime, date,)):
                    params[k] = v.strftime('%Y-%m-%d')
                elif not isinstance(v, six.string_types):
                    raise JQLSyntaxError('to_date must be datetime, datetime.date, or str')
            elif k == 'event_selectors':
                if not isinstance(v, collections.Iterable):
                    raise JQLSyntaxError("event_params['event_selectors'] must be iterable")
                for i, e in enumerate(v):
                    if not isinstance(e, dict):
                        raise JQLSyntaxError("event_params['event_selectors'][x] must be a dict")
                    for ek, ev in e.items():
                        if ek not in ('event', 'selector', 'label'):
                            raise JQLSyntaxError(
                                "'%s' is not a valid key in "
                                "event_params['event_selectors'][%s]" % (ek, i))
                        elif not isinstance(ev, six.string_types):
                            raise JQLSyntaxError(
                                "event_params['event_selectors'][%s].%s "
                                "must be a string" % (i, ek))
            else:
                raise JQLSyntaxError('"%s" is not a valid key in event_params' % k)
        return json.dumps(params)

Source File: convert_dstc8_data.py From NOESIS-II with Apache License 2.0

5 votes

def get_dialogs(filename):
    """Build the list of training rows for every dialog in a JSON file.

    For each entry the positive row from process_dialog is appended, followed
    by all rows it returns for the negative case.
    """
    rows = []
    with open(filename, 'rb') as f:
        for entry in ijson.items(f, 'item'):
            positive_row = process_dialog(entry, train=True, positive=True)
            rows.append(positive_row)
            negative_rows = process_dialog(
                entry, train=True, positive=False, all_negative=True)
            rows.extend(negative_rows)
    return rows

Source File: migrate_json.py From examples with Apache License 2.0

5 votes

def parse_data_to_dictionaries(input):
    '''
      Streams the JSON file at input["file"] + ".json" and collects every
      top-level item of its array.
      :param input.file as string: the path to the data file, minus the format
      :returns items as list: one element per data item from the file at input.file
    '''
    with open(input["file"] + ".json") as data:
        return list(ijson.items(data, "item"))

Source File: exchangerate.py From riko with MIT License

4 votes

def parser(base, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    When `skip` is set the original item (`kwargs['stream']`) is returned
    untouched. When `base` equals the target currency the rate is 1. Otherwise
    the rates document is fetched and parsed, and the rate is calculated from
    it; if fetching/parsing fails the error is logged and 0 is returned.

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: exchangerate)
        stream (dict): The original item

    Returns:
        dict: The item

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.275201')
    """
    same_currency = base == objconf.currency

    if skip:
        return kwargs['stream']

    if same_currency:
        return Decimal(1)

    decode = objconf.url.startswith('http')

    with fetch(decode=decode, **objconf) as f:
        try:
            json = next(items(f, ''))
        except Exception as e:
            # Log as much context as possible, then give up on this item.
            f.seek(0)
            logger.error('Error parsing {url}'.format(**objconf))
            logger.debug(f.read())
            logger.error(e)
            logger.error(traceback.format_exc())
            return 0

    places = Decimal(10) ** -objconf.precision
    rates = parse_response(json)
    return calc_rate(base, objconf.currency, rates, places=places)

Source File: exchangerate.py From riko with MIT License

4 votes

def async_pipe(*args, **kwargs):
    """A processor that asynchronously retrieves the current exchange rate
    for a given currency pair.

    This function only forwards its positional and keyword arguments to
    `async_parser` and returns whatever that call returns.

    Args:
        item (dict): The entry to process
        kwargs (dict): The keyword arguments passed to the wrapper

    Kwargs:
        conf (dict): The pipe configuration. May contain the keys 'url',
            'params', 'currency', 'delay', 'memoize', or 'field'.

            url (str): The exchange rate API url (default:
                http://finance.yahoo.com...)

            params (dict): The API url parameters (default: {'format': 'json'})
            currency: The (exchanging to) currency ISO abbreviation (default:
                USD).

            delay (float): Amount of time to sleep (in secs) before fetching
                the url. Useful for simulating network latency. Default: 0.

            memoize (bool): Cache the exchange rate API response (default:
                False).

        field (str): Item attribute from which to obtain the string to be
            formatted (default: 'content')

        assign (str): Attribute to assign parsed content (default:
            exchangerate)

    Returns:
        dict: twisted.internet.defer.Deferred stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['exchangerate'])
        ...     url = get_path('quote.json')
        ...     d = async_pipe({'content': 'GBP'}, conf={'url': url})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        1.275201
    """
    # NOTE(review): the defaults listed above are not visible in this
    # function; presumably they are applied by a decorator or by
    # async_parser -- confirm against the full module.
    return async_parser(*args, **kwargs)

Source File: convert_dstc8_data.py From NOESIS-II with Apache License 2.0

4 votes

def create_dev_file(dev_file, dev_file_out, answers):
    """Convert a JSON dev set into a tab-separated file of answer indices.

    One line is written per entry of ``dev_file``:
    ``<1-based index>\\t<context>\\t<correct answer no.>\\t<neg1|neg2|...>\\t``
    where every answer number is ``answers[<utterance> __eou__] + 1``. An
    entry without correct answers uses the "None" answer as its target, and
    the "None" answer is appended to the negatives whenever fewer than 100
    negatives were collected. Sample counts are printed at the end.

    :param dev_file: path of the JSON dev file to read
    :param dev_file_out: path of the text file to write
    :param answers: mapping from answer text (stripped, ending in
        "__eou__") to an integer index
    :raises KeyError: if an answer text is missing from ``answers``
    """
    positive_samples_count = 0
    negative_samples_count = 0

    # Both handles are managed by ``with`` so they are closed even when an
    # entry is malformed or an answer lookup fails (the input handle used to
    # be left open in every case).
    with open(dev_file_out, "w") as dev_file_op, \
            open(dev_file, 'rb') as dev_data_handle:
        for index, entry in enumerate(ijson.items(dev_data_handle, 'item')):
            fields = [str(index + 1), get_context(entry)]

            correct_options = entry['options-for-correct-answers']
            if len(correct_options) == 0:
                # No correct candidate: the target is the "None" answer.
                correct_utterance = "None"
                target_id = "NONE"
            else:
                correct_utterance = correct_options[0]['utterance']
                target_id = correct_options[0]['candidate-id']
            answer = (correct_utterance + " __eou__ ").strip()
            fields.append(str(answers[answer] + 1))
            positive_samples_count += 1

            negative_answers = []
            for utterance in entry['options-for-next']:
                if utterance['candidate-id'] == target_id:
                    continue
                answer = (utterance['utterance'] + " __eou__ ").strip()
                negative_answers.append(str(answers[answer] + 1))
                negative_samples_count += 1

            if len(negative_answers) < 100:
                negative_answers.append(str(answers["None __eou__"] + 1))
                negative_samples_count += 1

            fields.append("|".join(negative_answers))
            # Every field, including the last one, is followed by a tab.
            row = "\t".join(fields) + "\t"
            dev_file_op.write(row.replace("\n", "") + "\n")

    print("Saved dev data to {}".format(dev_file_out))
    print("Dev - Positive samples count - {}".format(positive_samples_count))
    print("Dev - Negative samples count - {}".format(negative_samples_count))

Python ijson.items() Examples