[Python][Library] 2 Pandas - 2. 파일 입출력

#!/usr/bin/env python

# coding: utf-8

# # Pandas II: 데이터 로딩, 저장, 파일 형식

# ### 입∙출력 방법

# - 텍스트 파일 이용하는 방법

# - 웹 API 이용해서 네트워크를 통해 불러오는 방법

# - 데이터베이스 이용하는 방법

# ---

# # 1. 텍스트 파일

# #### pandas 파일 파싱 함수

# 함수 | 설명

# :---|:---

# read_csv | 파일, URL 또는 파일과 유사한 객체로부터 구분된 데이터를 읽어온다. 데이터 구분자는 쉼표(,)를 기본으로 한다.

# read_table | 파일, URL 또는 파일과 유사한 객체로부터 구분된 데이터를 읽어온다. 데이터 구분자는 탭('\t')을 기본으로 한다.

# read_fwf | 고정폭 칼럼 형식에서 데이터를 읽어온다(구분자가 없는 데이터)

# read_clipboard | 클립보드에 있는 데이터를 읽어오는 read_table 함수. 웹페이지에서 표를 긁어올 때 유용하다.

# #### pandas 파일 파싱 함수 옵션

# - **색인**: 반환하는 DataFrame에서 하나 이상의 칼럼을 색인으로 지정할 수 있다. 파일이나 사용자로부터 칼럼의 이름을 받거나 아무것도 받지 않을 수 있다.

# - **자료형 추론과 데이터 변환**: 사용자 정의 값 변환과 비어있는 값을 위한 사용자 리스트를 포함한다.

# - **날짜 분석**: 여러 칼럼에 걸쳐 있는 날짜와 시간 정보를 하나의 칼럼에 조합해서 결과에 반영한다.

# - **반복**: 여러 파일에 걸쳐 있는 자료를 반복적으로 읽어올 수 있다.

# - **정제되지 않은 데이터 처리**: 로우나 꼬리말, 주석 건너뛰기 또는 천 단위마다 쉼표로 구분된 숫자 같은 사소한 일을 처리해준다.

# ### 1.1 텍스트 파일 읽기

# In[ ]:

from pandas import DataFrame, Series

import pandas as pd

import numpy as np

# In[ ]:

# Show the raw file contents. `type` is the Windows shell command; the POSIX
# equivalent would be `cat data/ex1.csv`.
get_ipython().system('type data\\ex1.csv')

# In[ ]:

# Plain read_csv call with no options.
df = pd.read_csv('data/ex1.csv')

# In[ ]:

# header=None: column names are generated automatically.
# NOTE(review): ex1.csv appears to contain a header row (a 'message' column is
# referenced by name further down), so that row would be read as data here --
# confirm this is intended.
pd.read_csv('data/ex1.csv', header=None)

# In[ ]:

# names=[...]: supply the column names explicitly.
# NOTE(review): same caveat as above -- without header=0 an existing header
# row is presumably kept as the first data row; verify against the file.
pd.read_csv('data/ex1.csv', names=['a1', 'b1', 'c1', 'd1', 'message1'])

# In[ ]:

# read_table with an explicit comma separator.
pd.read_table('data/ex1.csv', sep=',')

# In[ ]:

# Use the 'message' column as the index.
pd.read_csv('data/ex1.csv', index_col='message')

# In[ ]:

# Read the multi-key file without any index first, for comparison.
parsed = pd.read_csv('data/csv_mindex.csv')
parsed

# #### Specifying a hierarchical index

# In[ ]:

# Passing a list to index_col builds a hierarchical index from key1 and key2.
parsed = pd.read_csv('data/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

# #### 구분자로 정규표현식 사용가능

# - [파이썬 - 정규표현식 모듈](http://devanix.tistory.com/296)

# - [tutorial point](http://www.tutorialspoint.com/python/python_reg_expressions.htm)

# In[ ]:

# Show the raw file contents. `type` is the Windows shell command; the POSIX
# equivalent would be `cat data/ex3.txt`.
get_ipython().system('type data\\ex3.txt')

# In[ ]:

# Read the text file as a list of lines; every element keeps its trailing
# newline, which would still have to be stripped by hand. The context manager
# closes the file handle that a bare list(open(...)) leaves open.
with open('data/ex3.txt') as f:
    raw_lines = list(f)
raw_lines

# In[ ]:

# The separator is a regular expression (one or more whitespace characters).
# It is written as a raw string so that the backslash reaches the regex engine
# unchanged instead of forming an invalid string escape.
result = pd.read_table('data/ex3.txt', sep=r'\s+')
result

# [Pandas Documents Page: Text, CSV, HDF5, ⋯](http://pandas.pydata.org/pandas-docs/stable/io.html)

# In[ ]:

# Show the raw file contents (POSIX equivalent: `cat data/ex4.csv`).
get_ipython().system('type data\\ex4.csv')

# In[ ]:

# Read the file as-is, for comparison with the skiprows call below.
pd.read_csv('data/ex4.csv')

# In[ ]:

# skiprows takes a list of row numbers to ignore: here rows 0, 2 and 3.
pd.read_csv('data/ex4.csv', skiprows=[0,2,3])

# In[ ]:

# Show the raw file contents (POSIX equivalent: `cat data/ex5.csv`).
get_ipython().system('type data\\ex5.csv')

# In[ ]:

result = pd.read_csv('data/ex5.csv')
result

# In[ ]:

# Boolean mask of missing values; pd.isnull(result) is the function form.
result.isnull()

# #### Defining missing-value strings

# In[ ]:

# Every value listed in na_values is treated as NA, in any column.
result = pd.read_csv('data/ex5.csv', na_values=["world",12, "one"])
result

# In[ ]:

# A dict gives each column its own list of NA markers.
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}

# In[ ]:

pd.read_csv('data/ex5.csv', na_values=sentinels)

# #### read_csv / read_table 함수 인자

# 인자 | 설명

# :---|:---

# path | 파일 시스템에서의 위치, URL, 파일 객체를 나타내는 문자열

# sep or delimiter | 필드를 구분하기 위해 사용할 연속된 문자나 정규표현식

# header | 칼럼의 이름으로 사용할 로우의 번호, 기본 값은 0(첫 로우)이며 헤더가 없으면 None으로 지정할 수 있다.

# index_col | 색인으로 사용할 칼럼 번호나 이름, 계층적 색인을 지정할 경우 리스트를 넘길 수 있다.

# names | 컬럼 이름으로 사용할 리스트. header = None과 함께 사용한다.

# skiprows | 파일의 시작부터 무시할 로우의 개수 또는 무시할 로우 번호가 담긴 리스트

# na_values | NA 값으로 처리할 값들의 나열

# comment | 주석으로 분류되어 파싱하지 않을 문자 혹은 문자열

# parse_dates | 날짜를 datetime으로 변환할지의 여부. 기본값은 False이며, True일 경우 모든 칼럼에 다 적용된다. 리스트를 넘기면 변환할 칼럼을 지정할 수 있는데, [1, 2, 3]을 넘기면 각각의 칼럼을 datetime으로 변환하고, [[1, 3]]을 넘기면 1, 3번 칼럼을 조합해서 하나의 datetime으로 변환한다.

# keep_date_col | 여러 칼럼을 datetime으로 변환했을 경우 원래 칼럼을 남겨둘지의 여부. 기본값은 False

# converters | 변환 시 칼럼에 적용할 함수를 지정한다. 예를 들어 {'foo': f}는 'foo'칼럼에 f 함수를 적용한다. 전달하는 사전의 키 값은 칼럼 이름이나 번호가 될 수 있다.

# dayfirst | 모호한 날짜 형식일 경우 국제 형식으로 간주한다(7/6/2012는 2012년 6월 7일로 간주한다). 기본값은 False

# date_parser | 날짜 변환 시 사용할 함수

# nrows | 파일의 첫 일부만 읽어올 때 처음 몇 줄을 읽을 것인지 지정한다.

# iterator | 파일을 조금씩 읽을 때 사용하도록 TextParser 객체를 반환하도록 한다. 기본값은 False

# chunksize | TextParser 객체에서 사용할, 한 번에 읽을 파일의 크기

# skipfooter | 무시할 파일의 마지막 줄 수 (구 버전 pandas에서는 skip_footer)

# verbose | 파싱 결과에 대한 정보를 출력한다. 숫자가 아닌 값들이 들어있는 칼럼이면서 누락된 값이 있다면 줄 번호를 출력한다. 기본값은 False

# encoding | 유니코드 인코딩 종류를 지정한다. UTF-8로 인코딩된 텍스트일 경우 'utf-8'로 지정한다.

# squeeze | 칼럼이 하나뿐이라면 Series 객체를 반환한다. 기본값은 False

# thousands | 숫자를 천 단위로 끊을 때 사용할 ','나 '.' 같은 구분자

# ### 1.2 텍스트 파일 일부분만 읽기

# In[ ]:

# Read the whole file.
result = pd.read_csv('data/ex6.csv')

# In[ ]:

# Print the (rows, columns) shape, then preview the first rows.
print(result.shape)
result.head()

# #### nrows: read only the first few rows

# In[ ]:

# nrows=5 limits the read to the first five rows.
res = pd.read_csv('data/ex6.csv', nrows=5)

# In[ ]:

print(res.shape)
res

# #### chunksize: 일정량의 데이터를 순회하면서 조회 가능

# In[ ]:

# With chunksize set, read_csv returns a reader that yields the file in
# pieces of 10 rows instead of loading it all at once.
chunker = pd.read_csv('data/ex6.csv', chunksize=10)

# In[ ]:

# Print only the first chunk and its type; the loop stops as soon as a second
# chunk arrives. The reader is closed in `finally` so the underlying file
# handle is released even if iteration raises.
try:
    for i, x in enumerate(chunker):
        if i > 0:
            break
        print(x)
        print(type(x))
finally:
    chunker.close()

# ### 1.3 텍스트 파일에 쓰기

# In[ ]:

data = pd.read_csv('data/ex5.csv')
data

# In[ ]:

# Write the frame to a CSV file with no extra options.
data.to_csv('data/out.csv')

# In[ ]:

import sys

# In[ ]:

# Writing to sys.stdout prints the CSV text instead of saving it; sep='|'
# replaces the field delimiter.
data.to_csv(sys.stdout, sep='|')

# #### na_rep: write NaN (missing values) as the given string

# In[ ]:

data.to_csv(sys.stdout, na_rep='NULL')

# #### index, header

# In[ ]:

# index=False and header=False are passed to leave the index and header out.
data.to_csv(sys.stdout, index=False, header=False)

# In[ ]:

data.to_csv('data/out1.csv', index=False, header=False)

# In[ ]:

# Show the file that was just written (POSIX equivalent: `cat data/out1.csv`).
get_ipython().system('type data\\out1.csv')

# #### Write only some of the columns, in the given order

# In[ ]:

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

# ### 1.4 텍스트 파일 수동 처리(읽기, 쓰기)

# #### CSV 파일 읽기

# In[ ]:

# Show the raw file contents (POSIX equivalent: `cat data/ex7.csv`).
get_ipython().system('type data\\ex7.csv')

# In[ ]:

import csv

# In[ ]:

# Iterate over the parsed rows; each row is a list of strings.
# newline='' is the mode the csv module documents for its file objects, and
# the context manager closes the file even if parsing raises.
with open('data/ex7.csv', newline='') as f:
    reader = csv.reader(f)
    for line in reader:
        print(line)

# In[ ]:

# Materialize all rows while the file is open, then split off the header row.
with open('data/ex7.csv', newline='') as f:
    lines = list(csv.reader(f))
header, values = lines[0], lines[1:]
header

# In[ ]:

values

# In[ ]:

# zip(*values) transposes the rows into columns, so each header name maps to
# the tuple of values found in its column.
data_dict = dict(zip(header, zip(*values)))
data_dict

# In[ ]:

pd.DataFrame(data_dict)

# #### 사용자정의: 다양한 규칙을 클래스로 구현 (csv.Dialect 상속)

# - 다양한 구분자

# - 문자열을 둘러싸는 방법

# - 개행문자

# In[ ]:

class my_dialect(csv.Dialect):
    """CSV dialect: comma-delimited, double-quote quoting, '\\n' line endings.

    Pass the class itself as ``dialect=`` to ``csv.reader`` / ``csv.writer``.
    """

    # Line ending emitted by csv.writer.
    lineterminator = '\n'
    # Field separator.
    delimiter = ','
    # Character that wraps fields containing special characters.
    quotechar = '"'
    # Quote only the fields that need it.
    quoting = csv.QUOTE_MINIMAL

# In[ ]:

# Read the file using the custom dialect class. The context manager replaces
# the separate open()/close() cells, so the handle is released even when a
# row fails to parse.
with open('data/ex7.csv', newline='') as f:
    reader = csv.reader(f, dialect=my_dialect)
    for line in reader:
        print(line)

# In[ ]:

# Individual dialect settings can also be passed directly as keyword
# arguments, without defining a dialect class.
with open('data/ex7.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for line in reader:
        print(line)

# #### CSV 파일 쓰기

# In[ ]:

# Write a header row and three data rows with the custom dialect; the
# quoting keyword overrides the dialect's quoting with QUOTE_NONE.
# newline='' is the mode the csv module documents for files it writes to; it
# keeps the platform from translating the dialect's line terminator (without
# it, '\n' becomes '\r\n' on Windows).
with open('data/out3.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])

# In[ ]:

# Show the file that was just written (POSIX equivalent: `cat data/out3.csv`).
get_ipython().system('type data\\out3.csv')

# ### 1.5 엑셀 파일

# In[ ]:

# Read the 'result' sheet of the workbook. read_excel has no `encoding`
# parameter (an .xlsx file carries its own encoding), and current pandas
# raises TypeError when one is passed, so it is omitted.
df = pd.read_excel('data/score.xlsx', sheet_name='result')

# In[ ]:

# NOTE(review): the original cell held a commented-out `import openpyxl`;
# presumably openpyxl is the Excel engine expected to be installed -- confirm.

# In[ ]:

# Write the frame back out to a sheet named 'result'.
df.to_excel('data/out_score.xlsx', sheet_name='result')

# ### 1.6 JSON 파일

# - JSON(JavaScript Object Notation)은 웹브라우저와 다른 애플리케이션이 HTTP 요청으로 데이터를 보낼 때 널리 사용하는 표준 파일 형식

# #### Library: Pandas

# - read_json()

# - to_json()

# #### url 사용

# In[ ]:

# URL of the JSON document (alternatively this can be a file path)
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.json'

# Load the JSON document into a data frame
df = pd.read_json(url, orient='columns')

# View the first rows
df.head()

# #### file 사용

# In[ ]:

# Small 2x2 frame with labelled rows and columns, used by the JSON examples.
df = pd.DataFrame([['a', 'b'], ['c', 'd']],
                  index=['row 1', 'row 2'],
                  columns=['col 1', 'col 2'])

# In[ ]:

fname = 'data/out_test.json'

# In[ ]:

# Write the frame to the file using the 'split' orientation.
df.to_json(fname, orient='split')

# In[ ]:

# Show the file that was just written (POSIX equivalent: `cat data/out_test.json`).
get_ipython().system('type data\\out_test.json')

# In[ ]:

# Read it back with the same orient that was used for writing.
pd.read_json(fname, orient='split')

# #### buffer 사용

# In[ ]:

# Passing a literal JSON string straight to read_json is deprecated in pandas;
# wrapping the text in StringIO hands it over as a file-like object instead.
from io import StringIO

# Without a path, to_json returns the JSON text. Each cell below serializes
# the frame with one orient and reads it back with the same orient.
temp = df.to_json(orient='split')
print(temp)
pd.read_json(StringIO(temp), orient='split')

# In[ ]:

temp = df.to_json(orient='index')
print(temp)
pd.read_json(StringIO(temp), orient='index')

# In[ ]:

temp = df.to_json(orient='columns')
print(temp)
pd.read_json(StringIO(temp), orient='columns')

# In[ ]:

temp = df.to_json(orient='records')
print(temp)
pd.read_json(StringIO(temp), orient='records')

# In[ ]:

temp = df.to_json(orient='table')
print(temp)
pd.read_json(StringIO(temp), orient='table')

# #### Library: json

# - json.loads()

# - json.dumps()

# In[ ]:

import json

# #### json.loads()

# In[ ]:

# JSON text held in a Python string: a string value, an array of strings, a
# null, and an array of nested objects.
obj = '''

{

"name": "Wes",

"places_lived": ["United States", "Spain", "Germany"],

"pet": null, "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"},

{"name": "Katie", "age":33, "pet": "Cisco"}]

}

'''

# In[ ]:

# Parse the JSON text into Python objects and show the resulting type.
result = json.loads(obj)
print(type(result))
result

# In[ ]:

# Notebook %%writefile cell magic: writes the JSON text to data/out_test2.json.
get_ipython().run_cell_magic('writefile', 'data/out_test2.json', '{\n "name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null, "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"},\n {"name": "Katie", "age":33, "pet": "Cisco"}]\n}')

# In[ ]:

# Read the file through a context manager so the handle is closed right away;
# the original open(...).read() left it open until garbage collection.
with open('data/out_test2.json', encoding='utf-8') as fp:
    json_data = fp.read()

# Parse the JSON text into Python objects and show the resulting type.
result = json.loads(json_data)
print(type(result))
result

# #### json.dumps()

# In[ ]:

# Serialize the parsed object back into JSON text.
asjson = json.dumps(result)

# In[ ]:

print(type(asjson))
asjson

# In[ ]:

# Write through a context manager so the text is flushed and the handle is
# closed deterministically; open(...).write(...) relied on garbage collection.
with open('data/out_test3.json', 'w', encoding='utf-8') as fp:
    fp.write(asjson)

# In[ ]:

# Show the file that was just written (POSIX equivalent: `cat data/out_test3.json`).
get_ipython().system('type data\\out_test3.json')

# #### Example of using the parsed JSON object

# In[ ]:

# Build a frame from the list of sibling dicts, keeping only the 'name' and
# 'age' keys as columns.
siblings = DataFrame(result['siblings'], columns=['name', 'age'])

# In[ ]:

siblings

# ---

# # 2. 바이너리 파일

# ### 2.1 pickle

# #### 데이터를 효율적으로 저장하는 가장 손쉬운 방법

# - 파이썬에 기본으로 내장되어 있는 pickle 직렬화를 통해 데이터를 이진 형식으로 저장하는 것이다.

# - 편리하게도 pandas의 객체는 모두 pickle을 이용해서 데이터를 저장하는 to_pickle 메서드가 있다

# In[ ]:

frame = pd.read_csv('data/ex1.csv')

# In[ ]:

frame

# In[ ]:

# Serialize the frame to disk in pickle format.
frame.to_pickle('data/frame_pickle.dat')

# In[ ]:

# Load the pickled frame back.
# NOTE: unpickling can execute arbitrary code, so only read pickle files that
# come from a trusted source (here it is the file written just above).
pd.read_pickle('data/frame_pickle.dat')

# ### 2.2 HDF5 ( Hierarchical Data Format )

# - 계층적 데이터 형식

# - **HDF5는 데이터베이스가 아니다.** HDF5는 **한 번만 기록**하고 **여러 번 자주 읽어야** 하는 데이터에 최적화되어 있다.

# - 데이터는 아무때나 파일에 추가할 수 있지만 만약 여러 곳에서 동시에 파일을 쓴다면 파일이 깨지는 문제가 발생할 수 있다.

# ---

# # 3. HTML, 웹 API와 함께 사용하기

# - HTML

# - API - json, xml

# - Java Script - Library(selenium) 사용

# In[ ]:

import requests

# In[ ]:

url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'

# In[ ]:

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response (rate limit, 404, ...) into
# an exception instead of letting the error payload be parsed as data.
res = requests.get(url, timeout=10)
res.raise_for_status()

# In[ ]:

# Decode the JSON response body, show its Python type and the first items.
data = res.json()
print(type(data))
data[:3]

# In[ ]:

# Build a frame from the decoded JSON and preview it.
iss = pd.DataFrame(data)
iss.head()

# ---

# # 4. 데이터베이스(DBMS)

# [sqlite3 사용 및 설치](https://wikidocs.net/12453)

# In[ ]:

import sqlite3

# #### DB: Memory 사용

# #### 1) Connect to DBMS

# In[ ]:

import pandas.io.sql as sql

# #### 1) Connect to DBMS

# ':memory:' opens a database that is not backed by a file.
con = sqlite3.connect(':memory:')

# #### 2) Create Table

query = """

CREATE TABLE test

(a VARCHAR(20), b VARCHAR(20),

c REAL, d INTEGER

);"""

# #### 3) Insert Data

data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]

# '?' placeholders: executemany binds one tuple from `data` per inserted row.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

# Everything that touches the connection runs inside try/finally so the
# connection is closed even when a statement fails.
try:
    # `with con:` commits on success and rolls back if the block raises.
    with con:
        con.execute(query)

    with con:
        con.executemany(stmt, data)

    # #### 4) Select Data

    cursor = con.execute('select * from test')
    rows = cursor.fetchall()
    rows

    # One entry per result column; the column name is the first field.
    cursor.description

    pd.DataFrame(rows, columns=[col[0] for col in cursor.description])

    # #### 5) Select Data (using pandas)

    # pandas runs the query and builds the frame, column names included.
    sql.read_sql('select * from test', con)
finally:
    # #### 6) Disconnect from DBMS
    con.close()

# #### DB: File 사용

# #### 1) Connect to DBMS

# In[ ]:

# #### 1) Connect to DBMS

# The database file is created if it does not exist yet.
con1 = sqlite3.connect('data/sqlite_01.db')

# #### 2) Create Table

query1 = """

CREATE TABLE score

('번호' VARCHAR(5), '이름' VARCHAR(64),

'kor' INTEGER, 'eng' INTEGER, 'math' INTEGER

);

"""

# #### 3) Insert Data

data1 = [
    ('1','김지훈',90,80,85),
    ('2','김동현',90,85,90),
    ('3','박현우',80,80,80),
    ('4','박성민',85,90,75),
    ('5','서주원',95,95,100),
    ('6','윤석원',75,85,80),
    ('7','이서연',70,65,40),
    ('8','정민서',65,80,85),
    ('9','최수진',95,90,95),
    ('10','황미영',75,85,100)
]

# '?' placeholders: executemany binds one tuple from `data1` per inserted row.
stmt1 = "INSERT INTO score VALUES(?, ?, ?, ?,?)"

# Everything that touches the connection runs inside try/finally so the
# connection is closed even when a statement fails.
try:
    # The database lives in a file, so the table survives between runs and a
    # plain CREATE TABLE fails on the second run. Dropping any previous copy
    # first makes this example re-runnable with the same ten rows each time.
    # `with con1:` commits on success and rolls back if the block raises.
    with con1:
        con1.execute('DROP TABLE IF EXISTS score')
        con1.execute(query1)

    with con1:
        con1.executemany(stmt1, data1)

    # #### 4) Select Data

    # pd.read_sql is used here so this section only needs the top-level
    # pandas import.
    pd.read_sql('select * from score', con1)
finally:
    # #### 5) Disconnect from DBMS
    con1.close()

# ---

# In[ ]:

# end of file

# In[ ]:

LIST

저작자표시

'Programming' 카테고리의 다른 글

[Python][Library] 2 Pandas - 4. 그룹 연산 (0)	2022.01.02
[Python][Library] 2 Pandas - 3 데이터 처리 (0)	2022.01.02
[Python][Library] 2 Pandas - 1. 자료구조 (0)	2022.01.02
[Python][Library] 1 Numpy - tutorial (0)	2022.01.02
[Python][Library] 1 Numpy - 배열과 벡터 (0)	2022.01.02

Data Scientist Story For Sustainability

[Python][Library] 2 Pandas - 2. 파일 입출력

'Programming' 카테고리의 다른 글

티스토리툴바

[Python][Library] 2 Pandas - 2. 파일 입출력

'Programming' 카테고리의 다른 글

'Programming' Related Articles

티스토리툴바