In [1]:

import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (this is why a single cell below can produce several Out[...] blocks).
InteractiveShell.ast_node_interactivity = 'all'

재색인¶

새로운 색인에 적합하도록 객체를 새로 생성

데이터를 새로운 색인에 맞게 재배열하고, 존재하지 않는 색인 값이 있다면 비어 있는 값(NaN)을 새로 추가

In [3]:

# Series whose labels are deliberately not in alphabetical order;
# a dict keeps insertion order, so the label order is d, b, a, c.
s = pd.Series({'d': 4.5, 'b': 7.2, 'a': -5.3, 'c': 3.6})

s

Out[3]:

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [4]:

# Reindex onto a..e: existing labels are reordered, the unknown label 'e' becomes NaN.
new_labels = list('abcde')
s2 = s.reindex(new_labels)

s2

Out[4]:

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

method 옵션 중 ffill같은 메서드를 이용해 누락된 값을 직전의 값으로 채울 수 있다

In [5]:

# Sparse integer labels (0, 2, 4) leave gaps that a later reindex can fill.
s3 = pd.Series({0: 'blue', 2: 'purple', 4: 'yellow'})

s3

Out[5]:

0      blue
2    purple
4    yellow
dtype: object

In [6]:

s3.reindex(np.arange(6), method = 'ffill')

reindex는 행, 열 또는 둘 다 변경 가능하다

In [7]:

# 3x3 integer frame; row label 'b' is intentionally absent so reindexing can add it.
row_labels = ['a', 'c', 'd']
col_labels = ['ohio', 'texas', 'california']
frame = pd.DataFrame(np.arange(9).reshape(3, 3), index=row_labels, columns=col_labels)

frame

Out[7]:

	ohio	texas	california
a	0	1	2
c	3	4	5
d	6	7	8

In [8]:

# Reindex the rows only; the new label 'b' yields a row of NaN.
frame2 = frame.reindex(index=list('abcd'))

frame2

Out[8]:

	ohio	texas	california
a	0.0	1.0	2.0
b	NaN	NaN	NaN
c	3.0	4.0	5.0
d	6.0	7.0	8.0

In [11]:

states = ['texas', 'utah', 'california']

frame.reindex(columns = states) # reindex the columns via the `columns` keyword

# Alternative spelling: pass the labels positionally and name the axis
frame.reindex(states, axis = 'columns') 

Out[11]:

	texas	utah	california
a	1	NaN	2
c	4	NaN	5
d	7	NaN	8

Out[11]:

	texas	utah	california
a	1	NaN	2
c	4	NaN	5
d	7	NaN	8

loc 연산자를 이용해 재색인 가능

In [14]:

frame.loc[['a', 'd', 'c'], ['california', 'texas']]

Out[14]:

	california	texas
a	2	1
d	8	7
c	5	4

하나의 행이나 열 삭제

In [15]:

# Float Series 0.0..4.0 labelled a..e, used for the drop examples.
s = pd.Series(np.arange(5.), index=list('abcde'))

s

Out[15]:

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [17]:

new_s = s.drop('c')

new_s

Out[17]:

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [19]:

s.drop(['d', 'c'])

Out[19]:

a    0.0
b    1.0
e    4.0
dtype: float64

DataFrame에서는 행과 열 모두에서 색인값을 삭제할 수 있다

In [20]:

# 4x4 integer frame with state names as row labels and number words as column labels.
state_rows = ['ohio', 'colorado', 'utah', 'new york']
number_cols = ['one', 'two', 'three', 'four']
data = pd.DataFrame(np.arange(16).reshape(4, 4), index=state_rows, columns=number_cols)

data

Out[20]:

	one	two	three	four
ohio	0	1	2	3
colorado	4	5	6	7
utah	8	9	10	11
new york	12	13	14	15

In [21]:

data.drop(index = ['colorado', 'ohio']) # pass the labels of the rows to remove

Out[21]:

	one	two	three	four
utah	8	9	10	11
new york	12	13	14	15

In [24]:

data.drop(columns = ['two'])

Out[24]:

	one	three	four
ohio	0	2	3
colorado	4	6	7
utah	8	10	11
new york	12	14	15

열의 값을 삭제할 때는 axis = 1 or axis = 'columns'를 입력

In [25]:

data.drop('two', axis = 1)

data.drop(['two', 'four'], axis = 'columns')

Out[25]:

	one	three	four
ohio	0	2	3
colorado	4	6	7
utah	8	10	11
new york	12	14	15

Out[25]:

	one	three
ohio	0	2
colorado	4	6
utah	8	10
new york	12	14

색인하기, 선택하기, 거르기

In [26]:

# Float Series 0.0..3.0 labelled a..d, used for the indexing examples.
s = pd.Series(np.arange(4.), index=list('abcd'))

s

Out[26]:

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [27]:

s['b']

Out[27]:

1.0

In [28]:

# Positional access to the second element. Use .iloc explicitly: a bare integer
# key on a string-labelled Series (s[1]) relies on a positional fallback that
# pandas has deprecated (FutureWarning since 2.1).
s.iloc[1]

Out[28]:

1.0

In [29]:

s[2:4]

Out[29]:

c    2.0
d    3.0
dtype: float64

In [30]:

s[['b', 'a', 'd']]

Out[30]:

b    1.0
a    0.0
d    3.0
dtype: float64

In [31]:

# Positional selection of the 2nd and 4th elements. A list of integers passed to
# plain [] on a string-labelled Series uses the deprecated positional fallback,
# so spell it with .iloc.
s.iloc[[1, 3]]

Out[31]:

b    1.0
d    3.0
dtype: float64

In [32]:

s[s < 2]

Out[32]:

a    0.0
b    1.0
dtype: float64

특수 연산자인 loc을 이용

In [33]:

s.loc[['b', 'a', 'c']]

Out[33]:

b    1.0
a    0.0
c    2.0
dtype: float64

In [34]:

# Same values under two kinds of index: shuffled integers vs. strings.
s1 = pd.Series({2: 1, 0: 2, 1: 3})
s2 = pd.Series({'a': 1, 'b': 2, 'c': 3})


s1
s2

Out[34]:

2    1
0    2
1    3
dtype: int64

Out[34]:

a    1
b    2
c    3
dtype: int64

In [38]:

s1.loc[[0, 1, 2]]
s2.loc[['b', 'a', 'c']]

Out[38]:

0    2
1    3
2    1
dtype: int64

Out[38]:

b    2
a    1
c    3
dtype: int64

iloc는 정수로만 색인을 취한다

In [41]:

# iloc selects by integer position, whatever the index labels are.
s1.iloc[[0, 1, 2]] # positions 0, 1 and 2
s2.iloc[[0, 1, 2]]

Out[41]:

2    1
0    2
1    3
dtype: int64

Out[41]:

a    1
b    2
c    3
dtype: int64

레이블을 사용해 슬라이싱할 수 있지만 보통 파이썬 슬라이싱과 다르게 엔드포인트가 포함

In [42]:

s2.loc['b' : 'c']

Out[42]:

b    2
c    3
dtype: int64

In [43]:

# Label slices include both endpoints, so 'b' and 'c' are both overwritten with 5.
s2.loc['b' : 'c'] = 5

s2

Out[43]:

a    1
b    5
c    5
dtype: int64

In [44]:

# Rebuild the 4x4 integer frame (values 0..15) with state names as row labels
# and number words as column labels for the DataFrame indexing examples.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index = ['ohio', 'colorado', 'utah', 'new york'],
                    columns = ['one', 'two', 'three', 'four'])

data

Out[44]:

	one	two	three	four
ohio	0	1	2	3
colorado	4	5	6	7
utah	8	9	10	11
new york	12	13	14	15

In [45]:

data['two']

Out[45]:

ohio         1
colorado     5
utah         9
new york    13
Name: two, dtype: int64

In [46]:

data[['three', 'one']]

Out[46]:

	three	one
ohio	2	0
colorado	6	4
utah	10	8
new york	14	12

In [47]:

data[:2]

Out[47]:

	one	two	three	four
ohio	0	1	2	3
colorado	4	5	6	7

In [48]:

data.iloc[:2, :2]

Out[48]:

	one	two
ohio	0	1
colorado	4	5

In [49]:

data['three'] > 5
data[data['three'] > 5]

Out[49]:

ohio        False
colorado     True
utah         True
new york     True
Name: three, dtype: bool

Out[49]:

	one	two	three	four
colorado	4	5	6	7
utah	8	9	10	11
new york	12	13	14	15

In [52]:

# Element-wise comparison yields a boolean DataFrame of the same shape.
data < 5

# Assigning through the boolean frame overwrites every element below 5 with 0
# in place; the cells that follow display this modified `data`.
data[data < 5] = 0
data

Out[52]:

	one	two	three	four
ohio	True	True	True	True
colorado	True	False	False	False
utah	False	False	False	False
new york	False	False	False	False

Out[52]:

	one	two	three	four
ohio	0	0	0	0
colorado	0	5	6	7
utah	8	9	10	11
new york	12	13	14	15

※ loc과 iloc으로 선택하기 ※

In [55]:

# Selection by label with loc

# A single row label returns that row as a Series.
data.loc['colorado']
# A list of labels returns a DataFrame, even when the list has one element.
data.loc[['colorado']]
data.loc[['colorado', 'new york']]
# Row label plus a list of column labels returns a Series restricted to those columns.
data.loc['colorado', ['two', 'three']]

Out[55]:

one      0
two      5
three    6
four     7
Name: colorado, dtype: int64

Out[55]:

	one	two	three	four
colorado	0	5	6	7

Out[55]:

	one	two	three	four
colorado	0	5	6	7
new york	12	13	14	15

Out[55]:

two      5
three    6
Name: colorado, dtype: int64

In [56]:

# Selection by integer position with iloc

# A single position returns that row as a Series.
data.iloc[2]
# A list of positions returns a DataFrame.
data.iloc[[2]]
# Row position plus column positions; columns come back in the requested order.
data.iloc[2, [3, 0, 1]]
data.iloc[[1, 2], [3, 0, 1]]

Out[56]:

one       8
two       9
three    10
four     11
Name: utah, dtype: int64

Out[56]:

	one	two	three	four
utah	8	9	10	11

Out[56]:

four    11
one      8
two      9
Name: utah, dtype: int64

Out[56]:

	four	one	two
colorado	7	0	5
utah	11	8	9

In [59]:

data.loc[:'utah', 'two']

Out[59]:

ohio        0
colorado    5
utah        9
Name: two, dtype: int64

In [60]:

data.iloc[:, :3]
data.iloc[:, :3][data.three > 5]

Out[60]:

	one	two	three
ohio	0	0	0
colorado	0	5	6
utah	8	9	10
new york	12	13	14

Out[60]:

	one	two	three
colorado	0	5	6
utah	8	9	10
new york	12	13	14

In [61]:

data.three > 5
data.loc[data.three > 5]

Out[61]:

ohio        False
colorado     True
utah         True
new york     True
Name: three, dtype: bool

Out[61]:

	one	two	three	four
colorado	0	5	6	7
utah	8	9	10	11
new york	12	13	14	15

산술 연산과 데이터 정렬¶

In [62]:

# Two Series with partially overlapping labels (a, c, e are shared).
s1 = pd.Series({'a': 7.3, 'c': -2.5, 'd': 3.4, 'e': 1.5})
s2 = pd.Series({'a': -2.1, 'c': -2.5, 'e': -1.5, 'f': 4, 'g': 3.1})

s1
s2

Out[62]:

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

Out[62]:

a   -2.1
c   -2.5
e   -1.5
f    4.0
g    3.1
dtype: float64

In [63]:

s1 + s2

Out[63]:

a    5.2
c   -5.0
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

DataFrame의 경우 행과 열 모두에 정렬이 적용

In [64]:

# Two frames that overlap only partially in both rows and columns.
df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['ohio', 'texas', 'colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['utah', 'ohio', 'texas', 'oregon'],
    columns=['b', 'd', 'e'],
)

df1
df2

Out[64]:

	b	c	d
ohio	0.0	1.0	2.0
texas	3.0	4.0	5.0
colorado	6.0	7.0	8.0

Out[64]:

	b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
oregon	9.0	10.0	11.0

In [65]:

df1 + df2

Out[65]:

	b	c	d	e
colorado	NaN	NaN	NaN	NaN
ohio	3.0	NaN	6.0	NaN
oregon	NaN	NaN	NaN	NaN
texas	9.0	NaN	12.0	NaN
utah	NaN	NaN	NaN	NaN

공통되는 열이나 행 레이블이 없는 DataFrame을 더하면 결과는 모두 NaN(결측값)으로 채워진다

In [66]:

df1 = pd.DataFrame({'A' : [1, 2]})
df2 = pd.DataFrame({'B' : [3, 4]})

df1
df2

Out[66]:

	A
0	1
1	2

Out[66]:

	B
0	3
1	4

In [67]:

df1 + df2

Out[67]:

	A	B
0	NaN	NaN
1	NaN	NaN

※ 산술 연산 메서드에 채워 넣을 값 지정하기 ※

In [68]:

# df1 is 3x4 (columns a-d); df2 is 4x5 (columns a-e), so df2 has an extra row and column.
df1 = pd.DataFrame(np.arange(12.).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.arange(20.).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])

# Introduce a missing value at row 1, column 'b' of df2.
df2.loc[1, 'b'] = np.nan

df1
df2

Out[68]:

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

Out[68]:

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	4.0
1	5.0	NaN	7.0	8.0	9.0
2	10.0	11.0	12.0	13.0	14.0
3	15.0	16.0	17.0	18.0	19.0

In [69]:

# Plain + leaves NaN wherever a value is missing from either operand.
df1 + df2

# add(fill_value=0) treats a value missing from one operand as 0 before adding,
# so e.g. column 'e' (absent from df1) keeps df2's values.
df1.add(df2, fill_value = 0)

Out[69]:

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	NaN
1	9.0	NaN	13.0	15.0	NaN
2	18.0	20.0	22.0	24.0	NaN
3	NaN	NaN	NaN	NaN	NaN

Out[69]:

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	4.0
1	9.0	5.0	13.0	15.0	9.0
2	18.0	20.0	22.0	24.0	14.0
3	15.0	16.0	17.0	18.0	19.0

Series나 DataFrame을 재색인할 때도 fill_value를 지정할 수 있다

In [70]:

df1.reindex(columns = df2.columns, fill_value = 0) # df1 has no 'e' column, so the new column is filled with 0

Out[70]:

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	0
1	4.0	5.0	6.0	7.0	0
2	8.0	9.0	10.0	11.0	0

※ DataFrame과 Series 간의 연산 ※

In [71]:

arr = np.arange(12.).reshape((3, 4))

arr

Out[71]:

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [73]:

arr[0]

# The first row is broadcast against every row of arr and subtracted from each.
arr - arr[0]

Out[73]:

array([0., 1., 2., 3.])

Out[73]:

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

※ 브로드캐스팅 ※

In [75]:

# 4x3 float frame plus its first row pulled out as a Series for the broadcasting demo.
frame = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['utah', 'ohio', 'texas', 'oregon'],
    columns=['b', 'd', 'e'],
)

# First row by position: a Series labelled by the frame's columns.
series = frame.iloc[0]

frame
series

Out[75]:

	b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
oregon	9.0	10.0	11.0

Out[75]:

b    0.0
d    1.0
e    2.0
Name: utah, dtype: float64

In [76]:

frame - series

Out[76]:

	b	d	e
utah	0.0	0.0	0.0
ohio	3.0	3.0	3.0
texas	6.0	6.0	6.0
oregon	9.0	9.0	9.0

만약 색인값을 DataFrame의 열이나 Series의 색인에서 찾을 수 없다면 그 객체는 형식을 맞추기 위해 재색인된다

In [78]:

# Labels 'b' and 'e' match columns of `frame`; 'f' does not, and `frame`'s 'd' is absent here.
series2 = pd.Series(np.arange(3), index=list('bef'))

series2
frame

Out[78]:

b    0
e    1
f    2
dtype: int64

Out[78]:

	b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
oregon	9.0	10.0	11.0

In [79]:

frame + series2

Out[79]:

	b	d	e	f
utah	0.0	NaN	3.0	NaN
ohio	3.0	NaN	6.0	NaN
texas	6.0	NaN	9.0	NaN
oregon	9.0	NaN	12.0	NaN

In [81]:

series3 = frame['d']

series3
frame

Out[81]:

utah       1.0
ohio       4.0
texas      7.0
oregon    10.0
Name: d, dtype: float64

Out[81]:

	b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
oregon	9.0	10.0	11.0

In [83]:

# axis='index' aligns series3's labels with the frame's row index, so each row's
# 'd' value is subtracted from every column of that row.
frame.sub(series3, axis = 'index')

Out[83]:

	b	d	e
utah	-1.0	0.0	1.0
ohio	-1.0	0.0	1.0
texas	-1.0	0.0	1.0
oregon	-1.0	0.0	1.0

시퀀스 자료형, 인덱스 (1)	2023.12.23
리스트, 튜플 (0)	2023.12.23
입력 사용하기 (0)	2023.12.23
데이터 로딩과 저장, 파일 형식 (2)	2023.12.22
판다스 자료 구조 소개 (1)	2023.12.22

개발하고 싶어요

개발하고 싶어요

pandas 핵심 기능 - 재색인, 산술연산, 정렬 본문

pandas 핵심 기능 - 재색인, 산술연산, 정렬

재색인¶

산술 연산과 데이터 정렬¶

'PYTHON' 카테고리의 다른 글

티스토리툴바

« 2025/07 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31