Kernel: Python 3
In [1]:
import numpy as np import pandas as pd np.random.seed(12345) import matplotlib.pyplot as plt plt.rc("figure", figsize=(10, 6)) pd.options.display.max_colwidth = 75 pd.options.display.max_columns = 20 np.set_printoptions(precision=4, suppress=True)
In [2]:
!cat examples/ex1.csv
In [3]:
df = pd.read_csv("examples/ex1.csv") df
In [4]:
!cat examples/ex2.csv
In [5]:
pd.read_csv("examples/ex2.csv", header=None) pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"])
In [6]:
names = ["a", "b", "c", "d", "message"] pd.read_csv("examples/ex2.csv", names=names, index_col="message")
In [7]:
!cat examples/csv_mindex.csv parsed = pd.read_csv("examples/csv_mindex.csv", index_col=["key1", "key2"]) parsed
In [8]:
!cat examples/ex3.txt
In [9]:
result = pd.read_csv("examples/ex3.txt", sep="\s+") result
In [10]:
!cat examples/ex4.csv pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3])
In [11]:
!cat examples/ex5.csv result = pd.read_csv("examples/ex5.csv") result
In [12]:
pd.isna(result)
In [13]:
result = pd.read_csv("examples/ex5.csv", na_values=["NULL"]) result
In [14]:
result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False) result2 result2.isna() result3 = pd.read_csv("examples/ex5.csv", keep_default_na=False, na_values=["NA"]) result3 result3.isna()
In [15]:
sentinels = {"message": ["foo", "NA"], "something": ["two"]} pd.read_csv("examples/ex5.csv", na_values=sentinels, keep_default_na=False)
In [16]:
pd.options.display.max_rows = 10
In [17]:
result = pd.read_csv("examples/ex6.csv") result
In [18]:
pd.read_csv("examples/ex6.csv", nrows=5)
In [19]:
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000) type(chunker)
In [20]:
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000) tot = pd.Series([], dtype='int64') for piece in chunker: tot = tot.add(piece["key"].value_counts(), fill_value=0) tot = tot.sort_values(ascending=False)
In [21]:
tot[:10]
In [22]:
data = pd.read_csv("examples/ex5.csv") data
In [23]:
data.to_csv("examples/out.csv") !cat examples/out.csv
In [24]:
import sys data.to_csv(sys.stdout, sep="|")
In [25]:
data.to_csv(sys.stdout, na_rep="NULL")
In [26]:
data.to_csv(sys.stdout, index=False, header=False)
In [27]:
data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"])
In [28]:
!cat examples/ex7.csv
In [29]:
import csv f = open("examples/ex7.csv") reader = csv.reader(f)
In [30]:
for line in reader: print(line) f.close()
In [31]:
with open("examples/ex7.csv") as f: lines = list(csv.reader(f))
In [32]:
header, values = lines[0], lines[1:]
In [33]:
data_dict = {h: v for h, v in zip(header, zip(*values))} data_dict
In [34]:
obj = """ {"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}] } """
In [35]:
import json result = json.loads(obj) result
In [36]:
asjson = json.dumps(result) asjson
In [37]:
siblings = pd.DataFrame(result["siblings"], columns=["name", "age"]) siblings
In [38]:
!cat examples/example.json
In [39]:
data = pd.read_json("examples/example.json") data
In [40]:
data.to_json(sys.stdout) data.to_json(sys.stdout, orient="records")
In [41]:
tables = pd.read_html("examples/fdic_failed_bank_list.html") len(tables) failures = tables[0] failures.head()
In [42]:
close_timestamps = pd.to_datetime(failures["Closing Date"]) close_timestamps.dt.year.value_counts()
In [43]:
from lxml import objectify path = "datasets/mta_perf/Performance_MNR.xml" with open(path) as f: parsed = objectify.parse(f) root = parsed.getroot()
In [44]:
data = [] skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ", "DESIRED_CHANGE", "DECIMAL_PLACES"] for elt in root.INDICATOR: el_data = {} for child in elt.getchildren(): if child.tag in skip_fields: continue el_data[child.tag] = child.pyval data.append(el_data)
In [45]:
perf = pd.DataFrame(data) perf.head()
In [46]:
perf2 = pd.read_xml(path) perf2.head()
In [47]:
frame = pd.read_csv("examples/ex1.csv") frame frame.to_pickle("examples/frame_pickle")
In [48]:
pd.read_pickle("examples/frame_pickle")
In [49]:
!rm examples/frame_pickle
In [50]:
fec = pd.read_parquet('datasets/fec/fec.parquet')
In [51]:
xlsx = pd.ExcelFile("examples/ex1.xlsx")
In [52]:
xlsx.sheet_names
In [53]:
xlsx.parse(sheet_name="Sheet1")
In [54]:
xlsx.parse(sheet_name="Sheet1", index_col=0)
In [55]:
frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1") frame
In [56]:
writer = pd.ExcelWriter("examples/ex2.xlsx") frame.to_excel(writer, "Sheet1") writer.close()
In [57]:
frame.to_excel("examples/ex2.xlsx")
In [58]:
!rm examples/ex2.xlsx
In [59]:
!rm -f examples/mydata.h5
In [60]:
frame = pd.DataFrame({"a": np.random.standard_normal(100)}) store = pd.HDFStore("examples/mydata.h5") store["obj1"] = frame store["obj1_col"] = frame["a"] store
In [61]:
store["obj1"]
In [62]:
store.put("obj2", frame, format="table") store.select("obj2", where=["index >= 10 and index <= 15"]) store.close()
In [63]:
frame.to_hdf("examples/mydata.h5", "obj3", format="table") pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"])
In [64]:
import os os.remove("examples/mydata.h5")
In [65]:
import requests url = "https://api.github.com/repos/pandas-dev/pandas/issues" resp = requests.get(url) resp.raise_for_status() resp
In [66]:
data = resp.json() data[0]["title"]
In [67]:
issues = pd.DataFrame(data, columns=["number", "title", "labels", "state"]) issues
In [68]:
import sqlite3 query = """ CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );""" con = sqlite3.connect("mydata.sqlite") con.execute(query) con.commit()
In [69]:
data = [("Atlanta", "Georgia", 1.25, 6), ("Tallahassee", "Florida", 2.6, 3), ("Sacramento", "California", 1.7, 5)] stmt = "INSERT INTO test VALUES(?, ?, ?, ?)" con.executemany(stmt, data) con.commit()
In [70]:
cursor = con.execute("SELECT * FROM test") rows = cursor.fetchall() rows
In [71]:
cursor.description pd.DataFrame(rows, columns=[x[0] for x in cursor.description])
In [72]:
import sqlalchemy as sqla db = sqla.create_engine("sqlite:///mydata.sqlite") pd.read_sql("SELECT * FROM test", db)
In [73]:
!rm mydata.sqlite
In [74]: