반응형
In [1]:
import pandas as pd
import numpy as np
import platform
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Font setup: choose a Korean-capable font for the current OS so Hangul labels render.
from matplotlib import font_manager, rc

# platform.system() value -> font family that ships with that OS.
_KOREAN_FONTS = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
# Any other OS (e.g. Linux) falls back to NanumGothic, which has to be installed separately.
rc('font', family=_KOREAN_FONTS.get(platform.system(), 'NanumGothic'))
# Draw negative tick labels with an ASCII hyphen; the Unicode minus glyph is
# often missing from Korean fonts and would render as an empty box.
plt.rcParams['axes.unicode_minus'] = False
In [3]:
# Load the raw population workbook; header=1 takes the column names from the
# second row of the sheet (row index 1).
path = './data/05. population_raw_data.xlsx'
data = pd.read_excel(io=path, header=1)
data.head(n=3)
Out[3]:
In [4]:
# Forward-fill blank cells with the value above them, then give the region and
# total columns short names. ffill() replaces the deprecated
# fillna(method='pad') and returns a new frame instead of mutating in place.
data = data.ffill().rename(columns={'행정구역(동읍면)별(1)': '광역시도',
                                    '행정구역(동읍면)별(2)': '시도',
                                    '계': '인구수'})
# Drop the per-province subtotal rows ('소계'). The explicit .copy() makes the
# result an independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
data = data[data['시도'] != '소계'].copy()
data.head(3)
Out[4]:
In [5]:
# Rename the category column and shorten its labels to total / male / female.
# rename() returns a fresh frame, which removes the need for the old
# `data.is_copy = False` hack (that attribute no longer exists in pandas).
data = data.rename(columns={'항목': '구분'})
# Exact-match replacement; any label not listed here is left untouched.
data['구분'] = data['구분'].replace({'총인구수 (명)': '합계',
                                     '남자인구수 (명)': '남자',
                                     '여자인구수 (명)': '여자'})
data.head(3)
Out[5]:
In [6]:
# Five-year age-band columns that make up each aggregate group.
young_bands = ['20 - 24세', '25 - 29세', '30 - 34세', '35 - 39세']
senior_bands = ['65 - 69세', '70 - 74세', '75 - 79세', '80 - 84세',
                '85 - 89세', '90 - 94세', '95 - 99세', '100+']

# Add the band columns left to right (same order as writing a + b + c + ...).
data['20-39세'] = sum(data[band] for band in young_bands)
data['65세이상'] = sum(data[band] for band in senior_bands)
data.head(3)
Out[6]:
In [7]:
# One row per (province, district); one column per (measure, category) pair.
# pivot_table aggregates with its default function (mean).
pop = data.pivot_table(
    values=['인구수', '20-39세', '65세이상'],
    index=['광역시도', '시도'],
    columns=['구분'],
)
pop.head(3)
Out[7]:
- 인구 소멸비율: 20~39세 여성 인구 / (65세 이상 전체 인구 / 2)
- 소멸위기지역: 소멸비율 < 1.0
In [8]:
# Extinction ratio = women aged 20-39 divided by half of the 65+ population.
young_women = pop['20-39세', '여자']
seniors_total = pop['65세이상', '합계']
pop['소멸비율'] = young_women / (seniors_total / 2)
# The comparison yields a boolean column: True where the ratio is below 1.0.
pop['소멸위기지역'] = pop['소멸비율'] < 1.0
MultiIndex에서 값 뽑기: `df.index.get_level_values(1)`을 이용하면 인덱스의 두 번째 레벨(시도)만 꺼낼 수 있으므로, 소멸위기지역에 해당하는 시도 이름을 뽑아 낼 수 있다.
In [9]:
# Keep only the at-risk rows (the column is already boolean, so it works as a
# mask directly) and read the district names from index level 1.
pop.loc[pop['소멸위기지역']].index.get_level_values(1)
Out[9]:
In [10]:
# Move the (province, district) index levels back into ordinary columns.
pop = pop.reset_index()
pop.head(3)
Out[10]:
In [11]:
# Flatten the two-level column labels by concatenating both levels,
# e.g. ('인구수', '합계') -> '인구수합계'.
pop.columns = [upper + lower for upper, lower in pop.columns.tolist()]
pop.head(3)
Out[11]:
get_level_values()에 해당하는 것은 다음그림과 같다!¶
In [12]:
# Print the frame summary (column names, non-null counts, dtypes) to check the flattened columns.
pop.info()
2-2. 위기 지역을 지도로 보여주기¶
In [13]:
# List the distinct district names that need to be turned into map IDs.
pop['시도'].unique()
Out[13]:
2-2-1. 시와 구로 나누어 dictionary에 저장하기¶
~광역시가 아니면서 구를 가지고 있는 시와 행정구를 dict에 저장한다.
In [14]:
# Output list of map IDs, one slot per row of `pop`; filled in by the loop in the next cell.
si = [None]*len(pop)
# City name (without the trailing '시') -> names of the administrative
# districts (gu) under it, for cities that are not metropolitan cities
# but are still subdivided into gu.
tmp_gu_dict = {'수원':['장안구', '권선구', '팔달구', '영통구'],
'성남':['수정구', '중원구', '분당구'],
'안양':['만안구', '동안구'],
'안산':['상록구', '단원구'],
'고양':['덕양구', '일산동구', '일산서구'],
'용인':['처인구', '기흥구', '수지구'],
'청주':['상당구', '서원구', '흥덕구', '청원구'],
'천안':['동남구', '서북구'],
'전주':['완산구', '덕진구'],
'포항':['남구', '북구'],
'창원':['의창구', '성산구', '진해구', '마산합포구', '마산회원구'],
'부천':['오정구', '원미구', '소사구']}
In [15]:
def _make_region_id(province, district, gu_dict):
    """Build the short map ID for one (province, district) pair.

    Args:
        province: Value of the '광역시도' column, e.g. '경기도' or '서울특별시'.
        district: Value of the '시도' column, e.g. '수원시' or '강남구'.
        gu_dict: Mapping of city name -> list of gu names belonging to it.

    Returns:
        The ID string used to match rows against the map layout.
    """
    # province[-3:] is the last three characters of the province name.
    if province[-3:] not in ('광역시', '특별시', '자치시'):
        # district[:-1] drops the final character (the 시/군/구 suffix).
        stem = district[:-1]
        # Two different provinces each contain a '고성' district; disambiguate.
        if stem == '고성' and province == '강원도':
            region_id = '고성(강원)'
        elif stem == '고성' and province == '경상남도':
            region_id = '고성(경남)'
        else:
            region_id = stem

        # A gu listed under a city overrides the plain name with 'city gu'.
        for city, gu_names in gu_dict.items():
            if district in gu_names:
                if len(district) == 2:
                    region_id = city + ' ' + district
                elif district in ('마산합포구', '마산회원구'):
                    # Strip the leading '마산' as well as the trailing '구'.
                    region_id = city + ' ' + district[2:-1]
                else:
                    region_id = city + ' ' + district[:-1]
        return region_id

    if province == '세종특별자치시':
        return '세종'

    # Metropolitan / special cities: 'first two characters of the city + district'.
    if len(district) == 2:
        return province[:2] + ' ' + district
    # Names of three or more characters normally end in a 구/군/시 suffix,
    # so drop the last character to shorten them.
    return province[:2] + ' ' + district[:-1]


# Build the IDs in row order. This avoids using index labels as list positions
# and removes the bare `except: pass`, which could silently leave None entries.
si = [_make_region_id(province, district, tmp_gu_dict)
      for province, district in zip(pop['광역시도'], pop['시도'])]
si
Out[15]:
In [16]:
# Attach the generated map IDs as a new 'ID' column (values are taken in row order).
pop = pop.assign(ID=si)
pop.head(3)
Out[16]:
2-2-2. 필요없는 데이터 columns들을 지워주자¶
In [17]:
# Remove the columns that none of the later maps use.
unused_columns = ['20-39세남자', '65세이상남자', '65세이상여자']
pop = pop.drop(columns=unused_columns)
pop.head(3)
Out[17]:
2-2-3. 교재는 편하다.¶
ID로 나눈, excel파일을 불러오자
In [18]:
# Load the hand-made grid layout: each non-empty cell holds a region ID.
draw_korea_raw = pd.read_excel(io='./data/05. draw_korea_raw.xlsx')
draw_korea_raw.head(n=3)
Out[18]:
각 지역의 위치(x, y 좌표)가 격자 형태로 들어 있으므로, stack()을 이용해 행(y)·열(x)·ID 형태의 긴 표로 바꿔 준다.
In [19]:
# Turn the grid into a long table: stack() emits one row per non-empty cell,
# indexed by (row label, column label); those two levels become y and x.
draw_korea_raw_starcked = (
    pd.DataFrame(draw_korea_raw.stack())
    .reset_index()
    .rename(columns={'level_0': 'y', 'level_1': 'x', 0: 'ID'})
)
draw_korea_raw_starcked.head()
Out[19]:
In [20]:
# Shorter alias for the stacked layout table; this binds the same object, it does not copy it.
draw_korea = draw_korea_raw_starcked
In [21]:
# Border polylines drawn on top of the grid map. Each inner list is one
# polyline; each point is a (y, x) pair in grid coordinates, because the
# plotting code unpacks it as `ys, xs = zip(*path)`.
# NOTE(review): the borders presumably separate provinces / metropolitan
# cities — confirm against the layout sheet.
BORDER_LINES = [[(5, 1), (5, 2), (7, 2), (7, 3), (11,3), (11, 0)],
[(5, 4), (5, 5), (2, 5), (2, 7), (4, 7), (4, 9), (7, 9), (7, 7), (9, 7), (9, 5), (10, 5), (10, 4), (5, 4)],
[(1, 7), (1, 8), (3, 8), (3, 10), (10, 10), (10, 7), (12, 7), (12, 6), (11, 6), (11, 5), (12, 5), (12, 4), (11, 4),(11, 3)],
[(8, 10), (8, 11), (6, 11), (6, 12)],
[(12, 5), (13, 5), (13, 4), (14, 4), (14, 5), (15, 5), (15, 4), (16, 4), (16, 2)],
[(16, 4), (17, 4), (17, 5), (16, 5), (16, 6), (19, 6), (19, 5), (20, 5), (20, 4), (21, 4), (21, 3), (19, 3), (19, 1)],
[(13, 5), (13, 6), (16, 6)],
[(13, 5), (14, 5)],
[(21, 2), (21, 3), (22, 3), (22, 4), (24, 4),(24,2), (21, 2)],
[(20, 5), (21, 5), (21, 6), (23, 6)],
[(10, 8), (12, 8), (12, 9), (14, 9), (14, 8), (16, 8), (16, 6)],
[(14, 9), (14, 11), (14, 12), (13, 12), (13, 13)],
[(15, 8), (17, 8), (17, 10), (16, 10), (16, 11), (14, 11)],
[(17, 9), (18, 9), (18, 8), (19, 8), (19, 9), (20, 9), (20, 10), (21, 10)],
[(16, 11), (16, 13)],
[(27, 5), (27, 6), (25, 6)],
]
In [22]:
# Draw the empty block map: region labels at their grid cells plus the borders.
plt.figure(figsize=(8, 11))

for _idx, row in draw_korea.iterrows():
    # IDs of the form 'city district' are shown on two lines.
    id_parts = row['ID'].split()
    if len(id_parts) == 2:
        dispname = '{}\n{}'.format(id_parts[0], id_parts[1])
    elif row['ID'][:2] == '고성':
        # Drop the '(강원)' / '(경남)' disambiguation suffix for display.
        dispname = '고성'
    else:
        dispname = row['ID']

    # Use a smaller font when the last label line has three or more characters.
    if len(dispname.splitlines()[-1]) >= 3:
        fontsize, linespacing = 9.5, 1.5
    else:
        fontsize, linespacing = 11, 1.2

    # +0.5 puts the label at the centre of its unit grid cell.
    plt.annotate(dispname, (row['x'] + 0.5, row['y'] + 0.5), weight='bold',
                 fontsize=fontsize, ha='center', va='center',
                 linespacing=linespacing)

# The loop variable is named `border` rather than `path` so that it does not
# overwrite the notebook-level `path` holding the Excel file location.
for border in BORDER_LINES:
    ys, xs = zip(*border)
    plt.plot(xs, ys, c='black', lw=1.5)

# Grid row 0 belongs at the top, so flip the y axis.
plt.gca().invert_yaxis()
plt.axis('off')
plt.tight_layout()
plt.show()
2-3. draw_korea와 pop을 합쳐보자¶
pop['ID']에서 우리가 만든 draw_korea['ID']에 없는 애들은 빼줍시다.
In [23]:
# IDs that exist in `pop` but have no cell in the map layout.
tmp_list = list(set(pop['ID'].unique()) - set(draw_korea['ID'].unique()))
# Collect the index labels of every row carrying one of those IDs, then drop them in one call.
unplaceable_rows = [label for tmp in tmp_list for label in pop[pop['ID'] == tmp].index]
pop = pop.drop(unplaceable_rows)
tmp_list
Out[23]:
pop에 있는 데이터에 draw_korea에 있던 좌표(x, y)까지 합쳐 줍니다!
In [24]:
# Left join: keep every row of `pop` and add the x / y grid position for its ID.
pop = pop.merge(draw_korea, how='left', on=['ID'])
pop.head()
Out[24]:
2-4. 이제, 데이터를 지도에 그려봅시다!¶
In [25]:
def drawKorea(targetData, blockedMap, cmapname):
    """Draw a block map of Korea coloured by one column of `blockedMap`.

    Args:
        targetData: Name of the column in `blockedMap` that drives the colours.
        blockedMap: DataFrame with 'ID', 'x' and 'y' columns plus `targetData`.
        cmapname: Matplotlib colormap name passed to `pcolor`.

    The colour scale runs from the column minimum to the column maximum.
    Labels on cells whose value lies above the lowest quarter of that range
    are drawn in white so they stay readable on dark cells.
    """
    values = blockedMap[targetData]
    # Series.min()/max() skip NaN, unlike the builtins.
    vmin, vmax = values.min(), values.max()
    whitelabelmin = (vmax - vmin) * 0.25 + vmin
    datalabel = targetData

    # Reshape to a y-by-x grid and mask the cells that hold no region.
    mapdata = blockedMap.pivot_table(index='y', columns='x', values=targetData)
    masked_mapdata = np.ma.masked_where(np.isnan(mapdata), mapdata)

    plt.figure(figsize=(6, 8))
    plt.pcolor(masked_mapdata, vmin=vmin, vmax=vmax, cmap=cmapname,
               edgecolor='#aaaaaa', linewidth=0.5)

    for _idx, row in blockedMap.iterrows():
        # IDs of the form 'city district' are shown on two lines.
        id_parts = row['ID'].split()
        if len(id_parts) == 2:
            dispname = '{}\n{}'.format(id_parts[0], id_parts[1])
        elif row['ID'][:2] == '고성':
            # Drop the '(강원)' / '(경남)' disambiguation suffix for display.
            dispname = '고성'
        else:
            dispname = row['ID']

        # Use a smaller font when the last label line has three or more characters.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 8, 1.1
        else:
            fontsize, linespacing = 9, 0.9

        annocolor = 'white' if row[targetData] > whitelabelmin else 'black'
        # Fix: the computed colour is now actually applied (it used to be
        # ignored, so every label was drawn in the default colour).
        plt.annotate(dispname, (row['x'] + 0.5, row['y'] + 0.5), weight='bold',
                     fontsize=fontsize, ha='center', va='center',
                     color=annocolor, linespacing=linespacing)

    # Each border is a sequence of (y, x) points.
    for border in BORDER_LINES:
        ys, xs = zip(*border)
        plt.plot(xs, ys, c='black', lw=2)

    # Grid row 0 belongs at the top, so flip the y axis.
    plt.gca().invert_yaxis()
    plt.axis('off')

    cb = plt.colorbar(shrink=.1, aspect=10)
    cb.set_label(datalabel)

    plt.tight_layout()
    plt.show()
In [26]:
# Map of total population per region, on a blue colour scale.
drawKorea('인구수합계', pop, 'Blues')
소멸위기지역¶
True, False를 1, 0으로 나타내기
In [27]:
# Encode the flag as 1 / 0 so it can be used as a colour value.
pop['소멸위기지역'] = pop['소멸위기지역'].astype(int)
drawKorea('소멸위기지역', pop, 'Reds')
In [28]:
def drawKorea2(targetData, blockedMap, cmapname, whitelabelmin=20.):
    """Draw a block map of Korea with a colour scale symmetric around zero.

    Intended for columns that can be negative: the colour limits are
    (-m, +m), where m is the largest absolute value in the column, so zero
    always maps to the middle of the colormap.

    Args:
        targetData: Name of the column in `blockedMap` that drives the colours.
        blockedMap: DataFrame with 'ID', 'x' and 'y' columns plus `targetData`.
        cmapname: Matplotlib colormap name passed to `pcolor`.
        whitelabelmin: Labels on cells whose value exceeds this threshold are
            drawn in white. Defaults to 20., the previously hard-coded value.
    """
    datalabel = targetData

    # Values may be negative, so base the limits on the largest magnitude.
    # Series.abs().max() skips NaN, unlike the builtin min/max.
    tmp_max = blockedMap[targetData].abs().max()
    vmin, vmax = -tmp_max, tmp_max

    # Reshape to a y-by-x grid and mask the cells that hold no region.
    mapdata = blockedMap.pivot_table(index='y', columns='x', values=targetData)
    masked_mapdata = np.ma.masked_where(np.isnan(mapdata), mapdata)

    plt.figure(figsize=(6, 8))
    plt.pcolor(masked_mapdata, vmin=vmin, vmax=vmax, cmap=cmapname,
               edgecolor='#aaaaaa', linewidth=0.5)

    for _idx, row in blockedMap.iterrows():
        # IDs of the form 'city district' are shown on two lines.
        id_parts = row['ID'].split()
        if len(id_parts) == 2:
            dispname = '{}\n{}'.format(id_parts[0], id_parts[1])
        elif row['ID'][:2] == '고성':
            # Drop the '(강원)' / '(경남)' disambiguation suffix for display.
            dispname = '고성'
        else:
            dispname = row['ID']

        # Use a smaller font when the last label line has three or more characters.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 8, 1.1
        else:
            fontsize, linespacing = 9, 0.9

        annocolor = 'white' if row[targetData] > whitelabelmin else 'black'
        plt.annotate(dispname, (row['x'] + 0.5, row['y'] + 0.5), weight='bold',
                     fontsize=fontsize, ha='center', va='center',
                     color=annocolor, linespacing=linespacing)

    # Each border is a sequence of (y, x) points.
    for border in BORDER_LINES:
        ys, xs = zip(*border)
        plt.plot(xs, ys, c='black', lw=2)

    # Grid row 0 belongs at the top, so flip the y axis.
    plt.gca().invert_yaxis()
    plt.axis('off')

    cb = plt.colorbar(shrink=.1, aspect=10)
    cb.set_label(datalabel)

    plt.tight_layout()
    plt.show()
In [29]:
# Female share of the total population, as percentage points above/below 50 %.
female_share = pop['인구수여자'] / pop['인구수합계']
pop['여성비'] = (female_share - 0.5) * 100
In [30]:
# Diverging map of the female share (zero-centred colour scale).
drawKorea2('여성비', pop, 'RdBu')
2030여성비¶
In [31]:
# Female share among people aged 20-39, as percentage points above/below 50 %.
young_female_share = pop['20-39세여자'] / pop['20-39세합계']
pop['2030여성비'] = (young_female_share - 0.5) * 100
In [32]:
# Diverging map of the female share among ages 20-39 (zero-centred colour scale).
drawKorea2('2030여성비', pop, 'RdBu')
반응형
'Study > 파이썬으로 데이터 주무르기' 카테고리의 다른 글
[19대 선거] Selenium을 이용한 19대 선거 데이터 크롤링 (2) | 2018.07.16 |
---|---|
[Folium으로 지도 그리기] folium으로 지도그리기 feat.인구소멸 위기지역 (0) | 2018.07.13 |
[서울시 범죄율] folium을 이용한 시각화 (0) | 2018.07.11 |
[Linear regression] Boston dataset에 실제 적용해 보기 (0) | 2018.07.09 |
[Linear regression] 단일선형회귀분석 실습 (0) | 2018.07.09 |
댓글