Python pandas.read_json() Examples
The following are 30 code examples of pandas.read_json(), drawn from open source projects.
The examples are ranked by community votes; the header above each example names the original project, source file, and license.
You may also want to check out all available functions and classes of the pandas module.
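Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects listed below) of the three read_json() call patterns most of them exercise: the default column-oriented JSON, line-delimited JSON via lines=True, and the Table Schema format via orient="table".

import pandas as pd
from io import StringIO

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# Default column-oriented JSON: {"a":{"0":1,"1":2},"b":{"0":3,"1":4}}.
# Newer pandas versions prefer a file-like object (or path) over a raw
# JSON string, so the output is wrapped in StringIO here.
round_tripped = pd.read_json(StringIO(df.to_json()))

# Line-delimited JSON: one record per line, read back with lines=True.
jsonl = df.to_json(orient="records", lines=True)
from_lines = pd.read_json(StringIO(jsonl), lines=True)

# Table Schema JSON: orient="table" also preserves the index and dtypes.
table_json = df.to_json(orient="table")
from_table = pd.read_json(StringIO(table_json), orient="table")

Compression (compression='zip', 'gzip', ...), chunked reading (chunksize=...), and date handling (date_format, date_unit) are covered by the test-suite examples further down.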
Example #1
Source File: test_json_table_schema.py From recruit with Apache License 2.0 | 6 votes |
def test_comprehensive(self):
    df = DataFrame(
        {'A': [1, 2, 3, 4],
         'B': ['a', 'b', 'c', 'c'],
         'C': pd.date_range('2016-01-01', freq='d', periods=4),
         # 'D': pd.timedelta_range('1H', periods=4, freq='T'),
         'E': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'])),
         'F': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], ordered=True)),
         'G': [1.1, 2.2, 3.3, 4.4],
         # 'H': pd.date_range('2016-01-01', freq='d', periods=4,
         #                    tz='US/Central'),
         'I': [True, False, False, True],
         },
        index=pd.Index(range(4), name='idx'))

    out = df.to_json(orient="table")
    result = pd.read_json(out, orient="table")
    tm.assert_frame_equal(df, result)
Example #2
Source File: sql_dash_dropdown.py From dash-recipes with MIT License | 6 votes |
def dff_to_table(dff_json, dropdown_x, dropdown_y):
    dff = pd.read_json(dff_json)
    return {
        'data': [{
            'x': dff[dropdown_x],
            'y': dff[dropdown_y],
            'type': 'bar'
        }],
        'layout': {
            'margin': {
                'l': 20, 'r': 10, 'b': 60, 't': 10
            }
        }
    }
Example #3
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_date_format_frame(self):
    df = self.tsframe.copy()

    def test_w_date(date, date_unit=None):
        df['date'] = Timestamp(date)
        df.iloc[1, df.columns.get_loc('date')] = pd.NaT
        df.iloc[5, df.columns.get_loc('date')] = pd.NaT
        if date_unit:
            json = df.to_json(date_format='iso', date_unit=date_unit)
        else:
            json = df.to_json(date_format='iso')
        result = read_json(json)
        assert_frame_equal(result, df)

    test_w_date('20130101 20:43:42.123')
    test_w_date('20130101 20:43:42', date_unit='s')
    test_w_date('20130101 20:43:42.123', date_unit='ms')
    test_w_date('20130101 20:43:42.123456', date_unit='us')
    test_w_date('20130101 20:43:42.123456789', date_unit='ns')

    msg = "Invalid value 'foo' for option 'date_unit'"
    with pytest.raises(ValueError, match=msg):
        df.to_json(date_format='iso', date_unit='foo')
Example #4
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_v12_compat(self):
    df = DataFrame(
        [[1.56808523, 0.65727391, 1.81021139, -0.17251653],
         [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
         [1.51493992, 0.11805825, 1.629455, -1.31506612],
         [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
         [0.05951614, -2.69652057, 1.28163262, 0.34703478]],
        columns=['A', 'B', 'C', 'D'],
        index=pd.date_range('2000-01-03', '2000-01-07'))
    df['date'] = pd.Timestamp('19920106 18:21:32.12')
    df.iloc[3, df.columns.get_loc('date')] = pd.Timestamp('20130101')
    df['modified'] = df['date']
    df.iloc[1, df.columns.get_loc('modified')] = pd.NaT

    v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
    df_unser = pd.read_json(v12_json)
    assert_frame_equal(df, df_unser)

    df_iso = df.drop(['modified'], axis=1)
    v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
    df_unser_iso = pd.read_json(v12_iso_json)
    assert_frame_equal(df_iso, df_unser_iso)
Example #5
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_date_format_series(self):
    def test_w_date(date, date_unit=None):
        ts = Series(Timestamp(date), index=self.ts.index)
        ts.iloc[1] = pd.NaT
        ts.iloc[5] = pd.NaT
        if date_unit:
            json = ts.to_json(date_format='iso', date_unit=date_unit)
        else:
            json = ts.to_json(date_format='iso')
        result = read_json(json, typ='series')
        assert_series_equal(result, ts)

    test_w_date('20130101 20:43:42.123')
    test_w_date('20130101 20:43:42', date_unit='s')
    test_w_date('20130101 20:43:42.123', date_unit='ms')
    test_w_date('20130101 20:43:42.123456', date_unit='us')
    test_w_date('20130101 20:43:42.123456789', date_unit='ns')

    ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index)
    msg = "Invalid value 'foo' for option 'date_unit'"
    with pytest.raises(ValueError, match=msg):
        ts.to_json(date_format='iso', date_unit='foo')
Example #6
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_date_unit(self):
    df = self.tsframe.copy()
    df['date'] = Timestamp('20130101 20:43:42')
    dl = df.columns.get_loc('date')
    df.iloc[1, dl] = Timestamp('19710101 20:43:42')
    df.iloc[2, dl] = Timestamp('21460101 20:43:42')
    df.iloc[4, dl] = pd.NaT

    for unit in ('s', 'ms', 'us', 'ns'):
        json = df.to_json(date_format='epoch', date_unit=unit)

        # force date unit
        result = read_json(json, date_unit=unit)
        assert_frame_equal(result, df)

        # detect date unit
        result = read_json(json, date_unit=None)
        assert_frame_equal(result, df)
Example #7
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_weird_nested_json(self):
    # this used to core dump the parser
    s = r'''{
        "status": "success",
        "data": {
            "posts": [
                {
                    "id": 1,
                    "title": "A blog post",
                    "body": "Some useful content"
                },
                {
                    "id": 2,
                    "title": "Another blog post",
                    "body": "More content"
                }
            ]
        }
    }'''
    read_json(s)
Example #8
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_misc_example(self):
    # parsing unordered input fails
    result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])

    error_msg = """DataFrame\\.index are different

DataFrame\\.index values are different \\(100\\.0 %\\)
\\[left\\]:  Index\\(\\[u?'a', u?'b'\\], dtype='object'\\)
\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)"""
    with pytest.raises(AssertionError, match=error_msg):
        assert_frame_equal(result, expected, check_index_type=False)

    result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)
Example #9
Source File: test_readlines.py From recruit with Apache License 2.0 | 6 votes |
def test_readjson_chunks_multiple_empty_lines(chunksize):
    j = """

    {"A":1,"B":4}


    {"A":2,"B":5}


    {"A":3,"B":6}
    """
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test,
        obj="chunksize: {chunksize}".format(chunksize=chunksize))
Example #10
Source File: utils_mnli.py From interpret-text with MIT License | 6 votes |
def load_mnli_pandas_df(local_cache_path=".", file_split="train"):
    """Loads extracted test_utils into pandas

    Args:
        local_cache_path ([type], optional): [description].
            Defaults to current working directory.
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".

    Returns:
        pd.DataFrame: pandas DataFrame containing the specified
            MultiNLI subset.
    """
    try:
        download_file_and_extract(local_cache_path, file_split)
    except Exception as e:
        raise e

    return pd.read_json(
        os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True
    )
Example #11
Source File: run_han.py From DeepResearch with MIT License | 6 votes |
def main():
    """
    A small tutorial to use HAN module
    """
    filename = './News_Category_Dataset/News_Category_Dataset.json'
    df = pd.read_json(filename, lines=True).reset_index()
    df = preprocessing(df)
    han_network = HAN.HAN(text=df.text, labels=df.category,
                          num_categories=30,
                          pretrained_embedded_vector_path='./glove.6B/glove.6B.100d.txt',
                          max_features=200000, max_senten_len=150,
                          max_senten_num=4, embedding_size=100,
                          validation_split=0.2, verbose=1)
    print(han_network.get_model().summary())
    han_network.show_hyperparameters()
    ## How to change hyperparameters
    # Let's add regularizers
    # To replace a hyperparameter change the corresponding key value
    # to the new value in set_hyperparameters
    han_network.set_hyperparameters({'l2_regulizer': 1e-13,
                                     'dropout_regulizer': 0.5})
    han_network.show_hyperparameters()
    print(han_network.get_model().summary())
    han_network.train_model(epochs=3, batch_size=16,
                            best_model_path='./best_model.h5')
Example #12
Source File: test_readlines.py From recruit with Apache License 2.0 | 6 votes |
def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
                   columns=["a\\", 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
                '{"a\\\\":"foo\\"","b":"bar"}')
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)
Example #13
Source File: test_readlines.py From recruit with Apache License 2.0 | 6 votes |
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)
Example #14
Source File: test_readlines.py From vnpy_crypto with MIT License | 6 votes |
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)
Example #15
Source File: test_readlines.py From vnpy_crypto with MIT License | 6 votes |
def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
                   columns=["a\\", 'b'])
    result = df.to_json(orient="records", lines=True)
    expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
                '{"a\\\\":"foo\\"","b":"bar"}')
    assert result == expected
    assert_frame_equal(read_json(result, lines=True), df)
Example #16
Source File: test_pandas.py From recruit with Apache License 2.0 | 6 votes |
def test_read_jsonl_unicode_chars(self):
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)
Example #17
Source File: test_compression.py From recruit with Apache License 2.0 | 5 votes |
def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')

    assert_frame_equal(uncompressed_df, compressed_df)
Example #18
Source File: test_compression.py From recruit with Apache License 2.0 | 5 votes |
def test_read_unsupported_compression_type():
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")
Example #19
Source File: exploredata.py From MSMARCO with MIT License | 5 votes |
def general_stats_data_public(path):
    df = pd.read_json(path)
    query_type_label = {'LOCATION': 0, 'DESCRIPTION': 0, 'NUMERIC': 0,
                        'ENTITY': 0, 'PERSON': 0}
    total_size = len(df)
    for row in df.iterrows():
        category = row[1]['query_type']
        if category in query_type_label:
            query_type_label[category] += 1
    print('Columns:{}'.format(df.columns.values))
    print('{} queries'.format(total_size))
    print('----query distribution by dataset type----')
    for key in query_type_label:
        print(key + ',' + str(query_type_label[key]) + ',' +
              str(query_type_label[key] / total_size))
Example #20
Source File: test_compression.py From recruit with Apache License 2.0 | 5 votes |
def test_write_unsupported_compression_type():
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")
Example #21
Source File: test_compression.py From recruit with Apache License 2.0 | 5 votes |
def test_lines_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True,
                   compression=compression)
        roundtripped_df = pd.read_json(path, lines=True,
                                       compression=compression)
        assert_frame_equal(df, roundtripped_df)
Example #22
Source File: get_stats_about_length.py From MSMARCO with MIT License | 5 votes |
def main():
    file = sys.argv[1]
    df = pd.read_json(file)
    queries = {}
    answers = {}
    well_formed_answers = {}
    passages = {}
    for row in df.iterrows():
        queries[row[1]['query']] = 1
        for v in row[1]['answers']:
            answers[v] = 1
        for v in row[1]['wellFormedAnswers']:
            well_formed_answers[v] = 1
        for p in row[1]['passages']:
            passages[p['passage_text']] = 1
    data = {'queries': queries, 'answers': answers,
            'well_formed_answers': well_formed_answers,
            'passages': passages}
    for value in data:
        histogram = {}
        for v in data[value]:
            l = len(v.split())
            if l in histogram:
                histogram[l] += 1
            else:
                histogram[l] = 1
        compute_stats(histogram, value)
Example #23
Source File: converttowellformed.py From MSMARCO with MIT License | 5 votes |
def makewf(input, output):
    df = pd.read_json(input)
    df = df.drop('answers', 1)
    df = df.rename(columns={'wellFormedAnswers': 'answers'})
    df = df[df.answers != '[]']
    df.to_json(output)
    return
Example #24
Source File: test_compression.py From recruit with Apache License 2.0 | 5 votes |
def test_with_s3_url(compression, s3_resource):
    # Bucket "pandas-test" created in tests/io/conftest.py
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, 'rb') as f:
            s3_resource.Bucket("pandas-test").put_object(Key='test-1', Body=f)

    roundtripped_df = pd.read_json('s3://pandas-test/test-1',
                                   compression=compression)
    assert_frame_equal(df, roundtripped_df)
Example #25
Source File: test_pandas.py From recruit with Apache License 2.0 | 5 votes |
def test_index_false_from_json_to_json(self, orient, index):
    # GH25170
    # Test index=False in from_json to_json
    expected = DataFrame({'a': [1, 2], 'b': [3, 4]})
    dfjson = expected.to_json(orient=orient, index=index)
    result = read_json(dfjson, orient=orient)
    assert_frame_equal(result, expected)
Example #26
Source File: test_compression.py From recruit with Apache License 2.0 | 5 votes |
def test_compression_roundtrip(compression):
    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
                       [12.32112, 123123.2, 321321.2]],
                      index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        assert_frame_equal(df, pd.read_json(path, compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            result = fh.read().decode('utf8')
        assert_frame_equal(df, pd.read_json(result))
Example #27
Source File: test_json_table_schema.py From recruit with Apache License 2.0 | 5 votes |
def test_empty_frame_roundtrip(self, strict_check):
    # GH 21287
    df = pd.DataFrame([], columns=['a', 'b', 'c'])
    expected = df.copy()
    out = df.to_json(orient='table')

    result = pd.read_json(out, orient='table')
    # TODO: When DF coercion issue (#21345) is resolved tighten type checks
    tm.assert_frame_equal(expected, result,
                          check_dtype=strict_check,
                          check_index_type=strict_check)
Example #28
Source File: test_json_table_schema.py From recruit with Apache License 2.0 | 5 votes |
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
    df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
    out = df.to_json(orient="table")
    with pytest.raises(NotImplementedError, match='can not yet read '):
        pd.read_json(out, orient="table")
Example #29
Source File: test_json_table_schema.py From recruit with Apache License 2.0 | 5 votes |
def test_read_json_table_orient(self, index_nm, vals, recwarn):
    df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
    out = df.to_json(orient="table")
    result = pd.read_json(out, orient="table")
    tm.assert_frame_equal(df, result)
Example #30
Source File: test_pandas.py From recruit with Apache License 2.0 | 5 votes |
def test_read_inline_jsonl(self):
    # GH9180
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)