Python unicodecsv.DictReader() Examples
The following are 30 code examples of unicodecsv.DictReader(), taken from open-source projects. The project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the unicodecsv module.
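Before the project examples, a minimal usage sketch may help for orientation: unicodecsv mirrors the standard-library csv API, but it consumes byte streams and decodes each field for you, so files are opened in binary mode and an encoding is passed to the reader. The file name and column names below (people.csv, name, age) are hypothetical, and Python 3 syntax is assumed.

# Minimal usage sketch (hypothetical file and columns).
import unicodecsv

with open('people.csv', 'rb') as f:       # binary mode: unicodecsv decodes for you
    reader = unicodecsv.DictReader(f, encoding='utf-8')
    for row in reader:                    # each row is a dict keyed by the header row
        print(row['name'], row['age'])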
Example #1
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_read_short(self):
    fd, name = tempfile.mkstemp()
    fileobj = os.fdopen(fd, "w+b")
    try:
        fileobj.write(b"1,2,abc,4,5,6\r\n1,2,abc\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj,
                                fieldnames="1 2 3 4 5 6".split(),
                                restval="DEFAULT")
        self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                        "4": '4', "5": '5', "6": '6'})
        self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                        "4": 'DEFAULT', "5": 'DEFAULT',
                                        "6": 'DEFAULT'})
    finally:
        fileobj.close()
        os.unlink(name)
Example #2
Source File: make_grades_persistent.py From edx2bigquery with GNU General Public License v2.0
def cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="passed_timestamp"):
    """
    Removes the null values from grades_persistentcoursegrade.csv.gz.
    The function also fixes course ids by changing them from their
    edX URL format to their usual format. For instance,
    course-v1:MITx+STL.162x+2T2017 should be MITx/STL.162x/2T2017.

    This operation permanently modifies the CSV.

    :param csvfn: The path of the csv.gz to be modified
    :param tempfn: The path of the temporary csv.gz
    :type csvfn: str
    :type tempfn: str
    """
    with gzip.open(csvfn, "r") as open_csv:
        csv_dict = csv.DictReader(open_csv)
        with gzip.open(tempfn, "w+") as write_csv_file:
            write_csv = csv.DictWriter(write_csv_file, fieldnames=csv_dict.fieldnames)
            write_csv.writeheader()
            for row in csv_dict:
                row_dict = remove_nulls_from_row(row, field_to_fix)
                row_dict = fix_course_ids(row_dict)
                write_csv.writerow(row_dict)
    os.rename(tempfn, csvfn)
Example #3
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_read_dict_no_fieldnames(self):
    fd, name = tempfile.mkstemp()
    fileobj = os.fdopen(fd, "w+b")
    try:
        fileobj.write(b"f1,f2,f3\r\n1,2,abc\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj)
        self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
        self.assertEqual(next(reader), {"f1": '1', "f2": '2', "f3": 'abc'})
    finally:
        fileobj.close()
        os.unlink(name)

# Two test cases to make sure existing ways of implicitly setting
# fieldnames continue to work. Both arise from discussion in issue3436.
Example #4
Source File: print_batch.py From adversarial-squad with MIT License
def pred_human_eval():
    all_preds = collections.defaultdict(list)
    with open(OPTS.filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            all_preds[row['Input.qid']].append(row['Answer.response'])
    preds = {}
    for qid in all_preds:
        if OPTS.ensemble:
            for a in all_preds[qid]:
                count = sum(1 for pred in all_preds[qid] if a == pred)
                if count > 1:
                    preds[qid] = a
                    break
            else:
                preds[qid] = random.sample(all_preds[qid], 1)[0]
        else:
            preds[qid] = random.sample(all_preds[qid], 1)[0]
    print json.dumps(preds)
Example #5
Source File: test_functional.py From doorman with MIT License
def test_node_csv_download(self, node, testapp):
    import unicodecsv as csv
    node.enrolled_on = dt.datetime.utcnow()
    node.last_checkin = dt.datetime.utcnow()
    node.last_ip = '1.1.1.1'
    node.node_info = {'hardware_vendor': "Honest Achmed's Computer Supply"}
    node.save()

    resp = testapp.get(url_for('manage.nodes_csv'))

    assert resp.headers['Content-Type'] == 'text/csv; charset=utf-8'
    assert resp.headers['Content-Disposition'] == 'attachment; filename=nodes.csv'

    reader = csv.DictReader(io.BytesIO(resp.body))
    row = next(reader)

    assert row['Display Name'] == node.display_name
    assert row['Host Identifier'] == node.host_identifier
    assert row['Enrolled On'] == str(node.enrolled_on)
    assert row['Last Check-In'] == str(node.last_checkin)
    assert row['Last Ip Address'] == node.last_ip
    assert row['Is Active'] == 'True'
    assert row['Make'] == node.node_info['hardware_vendor']
Example #6
Source File: print_batch.py From adversarial-squad with MIT License
def dump_verify():
    with open(OPTS.filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if OPTS.worker and row['WorkerId'] != OPTS.worker: continue
            if row['AssignmentStatus'] == 'Rejected': continue
            print 'HIT %s' % row['HITId']
            print 'WorkerId: %s' % row['WorkerId']
            print 'Time: %s s' % row['WorkTimeInSeconds']
            qids = row['Input.qids'].split('\t')
            questions = row['Input.questions'].split('\t')
            sents = row['Answer.sents'].split('\t')
            responses = row['Answer.responses'].split('\t')
            for qid, q, s_str, response_str in zip(
                    qids, questions, sents, responses):
                print (' Example %s' % qid)
                print (' Question %s' % q)
                s_list = s_str.split('|')
                a_list = response_str.split('|')
                for s, a in zip(s_list, a_list):
                    print (' Sentence: %s' % sent_format(s)).encode('utf-8')
                    print (' Is good? %s' % colored(a, 'cyan'))
Example #7
Source File: print_batch.py From adversarial-squad with MIT License
def dump_grammar():
    with open(OPTS.filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if OPTS.worker and row['WorkerId'] != OPTS.worker: continue
            if row['AssignmentStatus'] == 'Rejected': continue
            print 'HIT %s' % row['HITId']
            print 'WorkerId: %s' % row['WorkerId']
            print 'Time: %s s' % row['WorkTimeInSeconds']
            input_qids = row['Input.qids'].split('\t')
            input_sents = row['Input.sents'].split('\t')
            ans_is_good = row['Answer.is-good'].split('\t')
            ans_responses = row['Answer.responses'].split('\t')
            for qid, s, is_good, response in zip(input_qids, input_sents,
                                                 ans_is_good, ans_responses):
                print (' Example %s' % qid)
                print (' Sentence: %s' % s).encode('utf-8')
                print (' Is good? %s' % is_good)
                print (' Response: %s' % colored(response, 'cyan')).encode('utf-8')
Example #8
Source File: print_batch.py From adversarial-squad with MIT License
def stats_grammar():
    # Read data
    worker_to_is_good = collections.defaultdict(list)
    worker_to_times = collections.defaultdict(list)
    with open(OPTS.filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['AssignmentStatus'] == 'Rejected': continue
            worker_id = row['WorkerId']
            ans_is_good = row['Answer.is-good'].split('\t')
            time = float(row['WorkTimeInSeconds'])
            worker_to_is_good[worker_id].extend(ans_is_good)
            worker_to_times[worker_id].append(time)
    # Aggregate by worker
    print '%d total workers' % len(worker_to_times)
    worker_stats = {}
    for worker_id in worker_to_times:
        times = sorted(worker_to_times[worker_id])
        t_median = times[len(times)/2]
        t_mean = sum(times) / float(len(times))
        is_good_list = worker_to_is_good[worker_id]
        num_qs = len(is_good_list)
        frac_good = sum(1.0 for x in is_good_list if x == 'yes') / num_qs
        worker_stats[worker_id] = (t_median, t_mean, num_qs, frac_good)
    # Print
    sorted_ids = sorted(list(worker_stats), key=lambda x: worker_stats[x][3])
    for worker_id in sorted_ids:
        t_median, t_mean, num_qs, frac_good = worker_stats[worker_id]
        print 'Worker %s: t_median %.1f, t_mean %.1f, %d questions, %.1f%% good' % (
            worker_id, t_median, t_mean, num_qs, 100.0 * frac_good)
Example #9
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_read_long_with_rest_no_fieldnames(self):
    fd, name = tempfile.mkstemp()
    fileobj = os.fdopen(fd, "w+b")
    try:
        fileobj.write(b"f1,f2\r\n1,2,abc,4,5,6\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj, restkey="_rest")
        self.assertEqual(reader.fieldnames, ["f1", "f2"])
        self.assertEqual(next(reader), {"f1": '1', "f2": '2',
                                        "_rest": ["abc", "4", "5", "6"]})
    finally:
        fileobj.close()
        os.unlink(name)
Example #10
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_read_multi(self):
    sample = [
        b'2147483648,43.0e12,17,abc,def\r\n',
        b'147483648,43.0e2,17,abc,def\r\n',
        b'47483648,43.0,170,abc,def\r\n'
    ]
    reader = csv.DictReader(sample, fieldnames="i1 float i2 s1 s2".split())
    self.assertEqual(next(reader), {"i1": '2147483648', "float": '43.0e12',
                                    "i2": '17', "s1": 'abc', "s2": 'def'})
Example #11
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_read_with_blanks(self):
    reader = csv.DictReader([b"1,2,abc,4,5,6\r\n", b"\r\n", b"1,2,abc,4,5,6\r\n"],
                            fieldnames="1 2 3 4 5 6".split())
    self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                    "4": '4', "5": '5', "6": '6'})
    self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                    "4": '4', "5": '5', "6": '6'})
Example #12
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_empty_file(self):
    csv.DictReader(BytesIO())
Example #13
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_decode_error_dictreader(self):
    """Make sure the error-handling mode is obeyed on DictReaders."""
    file = EncodedFile(BytesIO(u'name,height,weight\nLöwis,2,3'.encode('iso-8859-1')),
                       data_encoding='iso-8859-1')
    reader = csv.DictReader(file, encoding='ascii', errors='ignore')
    self.assertEqual(list(reader)[0]['name'], 'Lwis')
Example #14
Source File: utils.py From edx-enterprise with GNU Affero General Public License v3.0
def validate_csv(file_stream, expected_columns=None):
    """
    Validate csv file for encoding and expected header.

    Args:
        file_stream: input file
        expected_columns: list of column names that are expected to be present in csv

    Returns:
        reader: an iterable for csv data if csv passes the validation

    Raises:
        ValidationError
    """
    try:
        reader = unicodecsv.DictReader(file_stream, encoding="utf-8")
        reader_fieldnames = reader.fieldnames
    except (unicodecsv.Error, UnicodeDecodeError):
        raise ValidationError(ValidationMessages.INVALID_ENCODING)

    if expected_columns and set(expected_columns) - set(reader_fieldnames):
        raise ValidationError(ValidationMessages.MISSING_EXPECTED_COLUMNS.format(
            expected_columns=", ".join(expected_columns),
            actual_columns=", ".join(reader.fieldnames)
        ))
    return reader
Example #15
Source File: gen_csv_verify.py From adversarial-squad with MIT License
def read_sentences():
    id_to_sents = collections.defaultdict(list)
    with open(OPTS.batch_file) as f:
        reader = csv.DictReader(f)
        for row in reader:
            input_qids = row['Input.qids'].split('\t')
            input_sents = row['Input.sents'].split('\t')
            ans_is_good = row['Answer.is-good'].split('\t')
            ans_responses = row['Answer.responses'].split('\t')
            for qid, s, is_good, response in zip(input_qids, input_sents,
                                                 ans_is_good, ans_responses):
                if is_good == 'yes':
                    response = s
                if response not in id_to_sents[qid]:
                    id_to_sents[qid].append(response)
    return id_to_sents
Example #16
Source File: test.py From pyRevit with GNU General Public License v3.0
def test_read_dict_fieldnames_chain(self):
    import itertools
    fd, name = tempfile.mkstemp()
    f = os.fdopen(fd, "w+b")
    try:
        f.write(b"f1,f2,f3\r\n1,2,abc\r\n")
        f.seek(0)
        reader = csv.DictReader(f)
        first = next(reader)
        for row in itertools.chain([first], reader):
            self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
            self.assertEqual(row, {"f1": '1', "f2": '2', "f3": 'abc'})
    finally:
        f.close()
        os.unlink(name)
Example #17
Source File: print_batch.py From adversarial-squad with MIT License
def stats_verify():
    # Read data
    worker_to_is_good = collections.defaultdict(list)
    worker_to_times = collections.defaultdict(list)
    with open(OPTS.filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['AssignmentStatus'] == 'Rejected': continue
            worker_id = row['WorkerId']
            ans_is_good = [x for s in row['Answer.responses'].split('\t')
                           for x in s.split('|')]
            time = float(row['WorkTimeInSeconds'])
            worker_to_is_good[worker_id].extend(ans_is_good)
            worker_to_times[worker_id].append(time)
    # Aggregate by worker
    print '%d total workers' % len(worker_to_times)
    worker_stats = {}
    for worker_id in worker_to_times:
        times = sorted(worker_to_times[worker_id])
        t_median = times[len(times)/2]
        t_mean = sum(times) / float(len(times))
        is_good_list = worker_to_is_good[worker_id]
        num_qs = len(is_good_list)
        frac_good = sum(1.0 for x in is_good_list if x == 'yes') / num_qs
        worker_stats[worker_id] = (t_median, t_mean, num_qs, frac_good)
    # Print
    sorted_ids = sorted(list(worker_stats), key=lambda x: worker_stats[x][3])
    for worker_id in sorted_ids:
        t_median, t_mean, num_qs, frac_good = worker_stats[worker_id]
        print 'Worker %s: t_median %.1f, t_mean %.1f, %d questions, %.1f%% good' % (
            worker_id, t_median, t_mean, num_qs, 100.0 * frac_good)
Example #18
Source File: sos_parser.py From openelections-data-or with MIT License
def fetch_state_senate_races():
    f = open('state_senate_2016.csv', 'r')
    reader = unicodecsv.DictReader(f, encoding='utf-8')
    return [row for row in reader]
Example #19
Source File: sos_parser.py From openelections-data-or with MIT License
def fetch_state_house_races():
    f = open('state_house_2016.csv', 'r')
    reader = unicodecsv.DictReader(f, encoding='utf-8')
    return [row for row in reader]
Example #20
Source File: standardizer.py From 990-xml-reader with MIT License
def _make_groups(self):
    group_filepath = os.path.join(METADATA_DIRECTORY, 'groups.csv')
    with open(group_filepath, 'r') as reader_fh:
        reader = csv.DictReader(reader_fh)
        for row in reader:
            self.groups[row['xpath']] = row
    return True
Example #21
Source File: standardizer.py From 990-xml-reader with MIT License
def _make_variables(self):
    variable_filepath = os.path.join(METADATA_DIRECTORY, 'variables.csv')
    with open(variable_filepath, 'r') as variable_fh:
        reader = csv.DictReader(variable_fh)
        for row in reader:
            vardict = {}
            for col in self.variable_columns:
                vardict[col] = row[col]
            self.variables[row['xpath']] = vardict
    return True
Example #22
Source File: standardizer.py From 990-xml-reader with MIT License
def _make_schedule_parts(self):
    part_filepath = os.path.join(METADATA_DIRECTORY, 'schedule_parts.csv')
    with open(part_filepath, 'r') as reader_fh:
        reader = csv.DictReader(reader_fh)
        for row in reader:
            self.schedule_parts[row['parent_sked_part']] = {
                'name': row['part_name'],
                'ordering': row['ordering'],
                'parent_sked': row['parent_sked'],
                'parent_sked_part': row['parent_sked_part'],
                'is_shell': row['is_shell']
            }
    return True
Example #23
Source File: standardizer.py From 990-xml-reader with MIT License
def _make_line_numbers(self):
    filepath = os.path.join(METADATA_DIRECTORY, 'line_numbers.csv')
    with open(filepath, 'r') as reader_fh:
        reader = csv.DictReader(reader_fh)
        for row in reader:
            try:
                self.line_numbers[row['xpath']]
                self.line_numbers[row['xpath']].append(row)
            except KeyError:
                self.line_numbers[row['xpath']] = [row]
Example #24
Source File: standardizer.py From 990-xml-reader with MIT License
def _make_descriptions(self):
    filepath = os.path.join(METADATA_DIRECTORY, 'descriptions.csv')
    with open(filepath, 'r') as reader_fh:
        reader = csv.DictReader(reader_fh)
        for row in reader:
            try:
                self.descriptions[row['xpath']]
                self.descriptions[row['xpath']].append(row)
            except KeyError:
                self.descriptions[row['xpath']] = [row]
Example #25
Source File: import_geonames.py From EpiTator with Apache License 2.0
def read_geonames_csv():
    print("Downloading geoname data from: " + GEONAMES_ZIP_URL)
    try:
        url = request.urlopen(GEONAMES_ZIP_URL)
    except URLError:
        print("If you are operating behind a firewall, try setting the HTTP_PROXY/HTTPS_PROXY environment variables.")
        raise
    zipfile = ZipFile(BytesIO(url.read()))
    print("Download complete")
    # Loading geonames data may cause errors without setting csv.field_size_limit:
    if sys.platform == "win32":
        max_c_long_on_windows = (2**32 / 2) - 1
        csv.field_size_limit(max_c_long_on_windows)
    else:
        csv.field_size_limit(sys.maxint if six.PY2 else six.MAXSIZE)
    with zipfile.open('allCountries.txt') as f:
        reader = unicodecsv.DictReader(f,
                                       fieldnames=[k for k, v in geonames_field_mappings],
                                       encoding='utf-8',
                                       delimiter='\t',
                                       quoting=csv.QUOTE_NONE)
        for d in reader:
            d['population'] = parse_number(d['population'], 0)
            d['latitude'] = parse_number(d['latitude'], 0)
            d['longitude'] = parse_number(d['longitude'], 0)
            if len(d['alternatenames']) > 0:
                d['alternatenames'] = d['alternatenames'].split(',')
            else:
                d['alternatenames'] = []
            yield d
Example #26
Source File: load_course_sql.py From edx2bigquery with GNU General Public License v2.0
def rephrase_studentmodule_opaque_keys(fn_sm):
    '''
    Generate rephrased studentmodule, with opaque key entries for module_id and course_id
    translated into traditional format.
    '''
    fn_sm = path(fn_sm)
    orig_sm_fn = '%s/studentmodule_orig.csv.gz' % (fn_sm.dirname())
    cmd = 'cp %s %s' % (fn_sm, orig_sm_fn)
    print " Running %s" % cmd
    sys.stdout.flush()
    os.system(cmd)
    ofp = openfile(fn_sm, 'w')
    smfp = openfile(orig_sm_fn)
    cdr = csv.DictReader(smfp)
    first = True
    for entry in cdr:
        if first:
            odw = csv.DictWriter(ofp, fieldnames=cdr.fieldnames)
            odw.writeheader()
            first = False
        fix_opaque_keys(entry, 'module_id')
        fix_opaque_keys(entry, 'course_id')
        odw.writerow(entry)
    ofp.close()
    print "Rephrased %s -> %s to convert opaque keys syntax to standard module_id and course_id format" % (orig_sm_fn, fn_sm)
    sys.stdout.flush()

#-----------------------------------------------------------------------------
Example #27
Source File: make_person_course.py From edx2bigquery with GNU General Public License v2.0
def load_csv(self, fn, key, schema=None, multi=False, fields=None, keymap=None,
             useCourseDir=True):
    '''
    load csv file into memory, storing into dict with specified field (key) as the key.
    if multi, then each dict value is a list, with one or more values per key.

    if fields, load only those specified fields.
    '''
    data = OrderedDict()
    if keymap is None:
        keymap = lambda x: x
    for line in csv.DictReader(self.openfile(fn, useCourseDir=useCourseDir)):
        try:
            the_id = keymap(line[key])
        except Exception as err:
            self.log("oops, failed to do keymap, key=%s, line=%s" % (line[key], line))
            raise
        if fields:
            newline = {x: line[x] for x in fields}
            line = newline
        if multi:
            if the_id in data:
                data[the_id].append(line)
            else:
                data[the_id] = [line]
        else:
            data[the_id] = line
    return data
Example #28
Source File: test_hgvs_variantmapper_gcp.py From hgvs with Apache License 2.0
def gxp_file_reader(fn):
    rdr = csv.DictReader(open(fn, "r"), delimiter=str("\t"))
    for rec in rdr:
        if rec["id"].startswith("#"):
            continue
        yield rec
Example #29
Source File: test_hgvs_grammar_full.py From hgvs with Apache License 2.0
def test_parser_test_completeness(self):
    """ensure that all rules in grammar have tests"""
    grammar_rule_re = re.compile(r"^(\w+)")
    grammar_fn = pkg_resources.resource_filename("hgvs", "_data/hgvs.pymeta")
    with open(grammar_fn, "r") as f:
        grammar_rules = set(r.group(1) for r in filter(None, map(grammar_rule_re.match, f)))

    with open(self._test_fn, "r") as f:
        reader = csv.DictReader(f, delimiter=str("\t"))
        test_rules = set(row["Func"] for row in reader)

    untested_rules = grammar_rules - test_rules

    self.assertTrue(len(untested_rules) == 0, "untested rules: {}".format(untested_rules))
Example #30
Source File: test_hgvs_grammar_full.py From hgvs with Apache License 2.0
def test_parser_grammar(self):
    with open(self._test_fn, "r") as f:
        reader = csv.DictReader(f, delimiter=str("\t"))

        fail_cases = []

        for row in reader:
            if row["Func"].startswith("#"):
                continue

            # setup input
            inputs = self._split_inputs(row["Test"], row["InType"])
            expected_results = self._split_inputs(row["Expected"], row["InType"]) if row["Expected"] else inputs
            expected_map = dict(zip(inputs, expected_results))

            # step through each item and check
            is_valid = True if row["Valid"].lower() == "true" else False
            for key in expected_map:
                expected_result = six.text_type(expected_map[key]).replace("u'", "'")
                function_to_test = getattr(self.p._grammar(key), row["Func"])
                row_str = u"{}\t{}\t{}\t{}\t{}".format(row["Func"], key, row["Valid"], "one", expected_result)
                try:
                    actual_result = six.text_type(function_to_test()).replace("u'", "'")
                    if not is_valid or (expected_result != actual_result):
                        print("expected: {} actual:{}".format(expected_result, actual_result))
                        fail_cases.append(row_str)
                except Exception as e:
                    if is_valid:
                        print("expected: {} Exception: {}".format(expected_result, e))
                        fail_cases.append(row_str)

    # everything should have passed - report whatever failed
    self.assertTrue(len(fail_cases) == 0, pprint.pprint(fail_cases))