책/파이썬 데이터과학통계학습(23.01.03-23.01.09)(정보문화사)

part02데이터과학을 위한 파이썬

정지홍 2023. 1. 5. 19:32

넘파이:행렬/선형대수/통계

판다스:데이터 프로세싱

matplotlib,seaborn:시각화

scipy:수학/과학/통계 분석

scikit-learn:머신러닝

 


#배열 생성 연습
import numpy as np

data_1=[1,2,3,4,5]
data_2=[[1,2,3],[4,5,6]]
data_3=[[1,2.0,3],[4.0,5,6.0]]

vector_1=np.array(data_1)
matrix_1=np.array(data_2)
matrix_2=np.array(data_3)
matrix_3=np.array([[1,2,3],[4,5,6],[7,8,9]])

print(vector_1)
print(matrix_1)
print(matrix_2)
print(matrix_3)


[1 2 3 4 5]
[[1 2 3]
 [4 5 6]]
[[1. 2. 3.]
 [4. 5. 6.]]
[[1 2 3]
 [4 5 6]
 [7 8 9]]

 


#넘파이의 배열 생성을 위한 함수 연습
import numpy as np
print(np.zeros(5))  #요소의 값을 0으로 채운다.
print(np.zeros((2,5)))  #요소의 값은 2x5에 0으로 채운다
print(np.ones(5))   #요소값을 1로 채운다
print(np.empty((2,2,3)))    #크기만 지정하고 임의의 값이 채워진다
print(np.arange(5)) #range와 유사하지만 리스트가 아니라 넘파이 배열을 리턴한다.
print(np.full((3,3),7)) #해당 값으로 채워진 배열을 리턴한다


[0. 0. 0. 0. 0.]
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[1. 1. 1. 1. 1.]
[[[1.06444847e-311 2.81617418e-322 0.00000000e+000]
  [0.00000000e+000 0.00000000e+000 4.47032019e-038]]

 [[5.44736344e-090 1.63198008e+185 1.26900865e-076]
  [1.28925108e+165 6.48224659e+170 4.93432906e+257]]]
[0 1 2 3 4]
[[7 7 7]
 [7 7 7]
 [7 7 7]]

 


#배열의 형태를 확인한다
import numpy as np

data_1=[1,2,3,4,5]
data_2=[[1,2,3],[4,5,6]]
data_3=[[1,2.0,3],[4.0,5,6.0]]

vector_1=np.array(data_1)
matrix_1=np.array(data_2)
matrix_2=np.array(data_3)
matrix_3=np.array([[1,2,3],[4,5,6],[7,8,9]])

print(type(vector_1))   #넘파이 배열의 타입을 확인

#타입을 알려준다
print(vector_1.dtype)
print(matrix_1.dtype)
print(matrix_2.dtype)
print(matrix_3.dtype)

#몇 차원 배열인지 알려준다
print(vector_1.ndim)
print(matrix_1.ndim)
print(matrix_2.ndim)
print(matrix_3.ndim)

#행렬이 몇x몇 인지 알려준다.
print(vector_1.shape)
print(matrix_1.shape)
print(matrix_1.shape)
print(matrix_1.shape)

#행렬의 개수를 알려준다
print(vector_1.size)
print(matrix_1.size)
print(matrix_2.size)
print(matrix_3.size)

 


<class 'numpy.ndarray'>
int32
int32
float64
int32
1
2
2
2
(2, 3)

mport numpy as np

vector_1=np.array([1,2,3,4,5])

matrix_2=np.array([[1,2,3,4,5],
                    [6,7,8,9,10],
                    [11,12,13,14,15],
                    [16,17,18,19,20],
                    [21,22,23,24,25]
                    ])

print(vector_1[1])  #인덱스1 출력
print(vector_1[-1]) #인덱스-1 출력
print(matrix_2[1,1])    #인덱스 1,1 출력
print(matrix_2[2,-1])   #인덱스 2,-1출력

print(vector_1[:]) #모두
print(vector_1[:4]) #0번째부터 3까지 
print(vector_1[1:3])    #1부터 2까지
print(vector_1[2:]) #2부터 끝까지
print(vector_1[2:-1])   #2부터 뒤에서 2까지


2
5
7
15
[1 2 3 4 5]
[1 2 3 4]
[2 3]
[3 4 5]
[3 4]

 


mport numpy as np
matrix=np.array([[1,2,3,4,5],
                    [6,7,8,9,10],
                    [11,12,13,14,15],
                    [16,17,18,19,20],
                    [21,22,23,24,25]
                    ])

print(matrix[:2,3:])
print(matrix[:2,3:].shape)

print(matrix[4], matrix[4].shape)
print(matrix[4,:],matrix[4,:].shape)
print(matrix[4:,:],matrix[4:,:].shape)

print(matrix[:,:2],matrix[:,:2].shape)

print(matrix[1,:2],matrix[1,:2].shape)
print(matrix[1:2,:2],matrix[1:2,:2].shape)


[[ 4  5]
 [ 9 10]]
(2, 2)
[21 22 23 24 25] (5,)
[21 22 23 24 25] (5,)
[[21 22 23 24 25]] (1, 5)
[[ 1  2]
 [ 6  7]
 [11 12]
 [16 17]
 [21 22]] (5, 2)
[6 7] (2,)
[[6 7]] (1, 2)

 


import numpy as np
matrix=np.array([[1,2,3],[4,5,6],[7,8,9]])

print("sum:",np.sum(matrix))
print("sum:",matrix.sum())
print("mean:",matrix.mean())
print("mean:",np.mean(matrix))
print()

print("std:",np.std(matrix))
print("variance:",np.var(matrix))
print("max:",np.max(matrix))
print("min:",np.min(matrix))
print()

print("axis=0 , max:",np.max(matrix,axis=0))
print("axis=0 , min:",np.min(matrix,axis=0))
print("axis=0 , avg:",np.mean(matrix,axis=0))

print("axis=1 , max:",np.max(matrix,axis=1))
print("axis=1 , min:",np.min(matrix,axis=1))
print("axis=1 , avg:",np.mean(matrix,axis=1))


sum: 45
sum: 45
mean: 5.0
mean: 5.0

std: 2.581988897471611
variance: 6.666666666666667
max: 9
min: 1

axis=0 , max: [7 8 9]
axis=0 , min: [1 2 3]
axis=0 , avg: [4. 5. 6.]
axis=1 , max: [3 6 9]
axis=1 , min: [1 4 7]
axis=1 , avg: [2. 5. 8.]

import numpy as np
np.random.seed(0)   #난수 생성의 시드를 지정
print("0 to 1.0 rand num 10 : ",np.random.random(10))
print("1 to 45 rand num 6 : ",np.random.randint(0,46,6))
print("avg=0.0 and std=1.0 normal distribution num 5: ",np.random.normal(0.0,1.0,5))
print("1.0< x < 5.0 num 5:  ",np.random.uniform(1.0,5.0,5))

print("0.0 < x < 1.0 num 3 : ",np.random.random_sample((3,3)))
print("0.0 < x < 1.0 num 3 : ",np.random.uniform(0.0,1.0,(3,3)))
print("0.0 < x < 1.0 num 3 : ",np.random.rand(3,3))

print("normal distribution rand num avg=0 and std=1 : \n",np.random.standard_normal((3,3)))
print("normal distribution rand num avg=0 and std=1 : \n",np.random.normal(0.0,1.0,(3,3)))
print("normal distribution rand num avg=0 and std=1 : \n",np.random.randn(3,3))

print("1~3 choice random:",np.random.choice([1,2,3],3))

vector=[1,2,3,4,5]
np.random.shuffle(vector)
print(vector)
print("\n\n\n")

vector=np.random.randn(5)
print(vector)
vector.sort()
print("after sort : ",vector)

print()
matrix_1=np.random.rand(5,5)
print("before sort matrix 1-->",matrix_1)
matrix_1.sort()
print("after sort matrix1-->",matrix_1)
print()

matrix_2=np.random.rand(5,5)
print("before sort matrix 2-->",matrix_2)
matrix_2.sort(axis=0)
print("after sort  axis=0  matrix 2-->",matrix_2)
print()

matrix_3=np.random.rand(5,5)
print("before sort matrix 3-->",matrix_3)
matrix_3.sort(axis=1)
print("after sort  axis=1  matrix 3-->",matrix_3)
print()

matrix_4=np.random.randn(5,5)
print("before sort 1st col:",matrix_4)
matrix_4[:0].sort()
print("after sort-->",matrix_4)


0 to 1.0 rand num 10 :  [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548  0.64589411
 0.43758721 0.891773   0.96366276 0.38344152]
1 to 45 rand num 6 :  [38 39 23 24 17 37]
avg=0.0 and std=1.0 normal distribution num 5:  [-0.470771    0.973016   -1.27814912  1.43737068 -0.07770457]
1.0< x < 5.0 num 5:   [4.20364301 3.08190992 3.71551812 3.88253062 3.32807917]
0.0 < x < 1.0 num 3 :  [[0.53737323 0.75861562 0.10590761]
 [0.47360042 0.18633234 0.73691818]
 [0.21655035 0.13521817 0.32414101]]
0.0 < x < 1.0 num 3 :  [[0.14967487 0.22232139 0.38648898]
 [0.90259848 0.44994999 0.61306346]
 [0.90234858 0.09928035 0.96980907]]
0.0 < x < 1.0 num 3 :  [[0.65314004 0.17090959 0.35815217]
 [0.75068614 0.60783067 0.32504723]
 [0.03842543 0.63427406 0.95894927]]
normal distribution rand num avg=0 and std=1 : 
 [[ 1.08963016  1.25441407  1.41910204]
 [-0.74385608 -2.5174371  -1.50709602]
 [ 1.14907613 -1.19357825  1.14104245]]
normal distribution rand num avg=0 and std=1 : 
 [[ 1.50944508  1.06777513 -0.68658948]
 [ 0.01487332 -0.3756659  -0.03822364]
 [ 0.36797447 -0.0447237  -0.30237513]]
normal distribution rand num avg=0 and std=1 : 
 [[-2.2244036   0.72400636  0.35900276]
 [ 1.07612104  0.19214083  0.85292596]
 [ 0.01835718  0.42830357  0.99627783]]
1~3 choice random: [3 1 3]
[1, 2, 5, 3, 4]




[ 0.92525075 -0.90478616  1.84369153  1.52550724 -1.44553558]
after sort :  [-1.44553558 -0.90478616  0.92525075  1.52550724  1.84369153]

before sort matrix 1--> [[0.95274901 0.44712538 0.84640867 0.69947928 0.29743695]
 [0.81379782 0.39650574 0.8811032  0.58127287 0.88173536]
 [0.69253159 0.72525428 0.50132438 0.95608363 0.6439902 ]
 [0.42385505 0.60639321 0.0191932  0.30157482 0.66017354]
 [0.29007761 0.61801543 0.4287687  0.13547406 0.29828233]]
after sort matrix1--> [[0.29743695 0.44712538 0.69947928 0.84640867 0.95274901]
 [0.39650574 0.58127287 0.81379782 0.8811032  0.88173536]
 [0.50132438 0.6439902  0.69253159 0.72525428 0.95608363]
 [0.0191932  0.30157482 0.42385505 0.60639321 0.66017354]
 [0.13547406 0.29007761 0.29828233 0.4287687  0.61801543]]

before sort matrix 2--> [[0.56996491 0.59087276 0.57432525 0.65320082 0.65210327]
 [0.43141844 0.8965466  0.36756187 0.43586493 0.89192336]
 [0.80619399 0.70388858 0.10022689 0.91948261 0.7142413 ]
 [0.99884701 0.1494483  0.86812606 0.16249293 0.61555956]
 [0.12381998 0.84800823 0.80731896 0.56910074 0.4071833 ]]
after sort  axis=0  matrix 2--> [[0.12381998 0.1494483  0.10022689 0.16249293 0.4071833 ]
 [0.43141844 0.59087276 0.36756187 0.43586493 0.61555956]
 [0.56996491 0.70388858 0.57432525 0.56910074 0.65210327]
 [0.80619399 0.84800823 0.80731896 0.65320082 0.7142413 ]
 [0.99884701 0.8965466  0.86812606 0.91948261 0.89192336]]

before sort matrix 3--> [[0.069167   0.69742877 0.45354268 0.7220556  0.86638233]
 [0.97552151 0.85580334 0.01171408 0.35997806 0.72999056]
 [0.17162968 0.52103661 0.05433799 0.19999652 0.01852179]
 [0.7936977  0.22392469 0.34535168 0.92808129 0.7044144 ]
 [0.03183893 0.16469416 0.6214784  0.57722859 0.23789282]]
after sort  axis=1  matrix 3--> [[0.069167   0.45354268 0.69742877 0.7220556  0.86638233]
 [0.01171408 0.35997806 0.72999056 0.85580334 0.97552151]
 [0.01852179 0.05433799 0.17162968 0.19999652 0.52103661]
 [0.22392469 0.34535168 0.7044144  0.7936977  0.92808129]
 [0.03183893 0.16469416 0.23789282 0.57722859 0.6214784 ]]

before sort 1st col: [[ 0.37716061  0.1666735   0.63503144  2.38314477  0.94447949]
 [-0.91282223  1.11701629 -1.31590741 -0.4615846  -0.06824161]
 [ 1.71334272 -0.74475482 -0.82643854 -0.09845252 -0.66347829]
 [ 1.12663592 -1.07993151 -1.14746865 -0.43782004 -0.49803245]
 [ 1.92953205  0.94942081  0.08755124 -1.22543552  0.84436298]]
after sort--> [[ 0.37716061  0.1666735   0.63503144  2.38314477  0.94447949]
 [-0.91282223  1.11701629 -1.31590741 -0.4615846  -0.06824161]
 [ 1.71334272 -0.74475482 -0.82643854 -0.09845252 -0.66347829]
 [ 1.12663592 -1.07993151 -1.14746865 -0.43782004 -0.49803245]
 [ 1.92953205  0.94942081  0.08755124 -1.22543552  0.84436298]]

import numpy as np
matrix=np.random.randn(5,5)
print(matrix)

vector=np.array([5,3,1,2,4])

matrix[0]=vector
print("insert vector in matrix[0]:\n",matrix)
print("sort with argsort()\n",matrix[:,matrix[0].argsort()])


[[-1.00021535 -1.5447711   1.18802979  0.31694261  0.92085882]
 [ 0.31872765  0.85683061 -0.65102559 -1.03424284  0.68159452]
 [-0.80340966 -0.68954978 -0.4555325   0.01747916 -0.35399391]
 [-1.37495129 -0.6436184  -2.22340315  0.62523145 -1.60205766]
 [-1.10438334  0.05216508 -0.739563    1.5430146  -1.29285691]]
insert vector in matrix[0]:
 [[ 5.          3.          1.          2.          4.        ]
 [ 0.31872765  0.85683061 -0.65102559 -1.03424284  0.68159452]
 [-0.80340966 -0.68954978 -0.4555325   0.01747916 -0.35399391]
 [-1.37495129 -0.6436184  -2.22340315  0.62523145 -1.60205766]
 [-1.10438334  0.05216508 -0.739563    1.5430146  -1.29285691]]
sort with argsort()
 [[ 1.          2.          3.          4.          5.        ]
 [-0.65102559 -1.03424284  0.85683061  0.68159452  0.31872765]
 [-0.4555325   0.01747916 -0.68954978 -0.35399391 -0.80340966]
 [-2.22340315  0.62523145 -0.6436184  -1.60205766 -1.37495129]
 [-0.739563    1.5430146   0.05216508 -1.29285691 -1.10438334]]

import numpy as np
data=[  [1,2,3,4],
        [5,6,7,8],
        [9,10,11,12],
        [13,14,15,16],
        [17,18,19,20]]

matrix=np.array(data)
print("matrix.shape:",matrix.shape)

print("reshape 2x10 :",matrix.reshape(2,10))

print("size:",matrix.size)

print("reshape 1 , -1 --->",matrix.reshape(1,-1))
print("reshape 2 , -1 --->",matrix.reshape(2,-1))
print("reshape 4 , -1 --->",matrix.reshape(4,-1))
print("reshape 5 , -1 --->",matrix.reshape(5,-1))

print("reshape -1 --->",matrix.reshape(-1))
 
data=[  [1,2,3,4],
        [5,6,7,8],
        [9,10,11,12],
        [13,14,15,16],
        [17,18,19,20]]

matrix=np.array(data)

print("matrix.T:\n",matrix.T)
print("matrix.transpose()\n",matrix.transpose())
print("np.transpose(matrix)\n",np.transpose(matrix))
print("flatten",matrix.flatten())


matrix.shape: (5, 4)
reshape 2x10 : [[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]]
size: 20
reshape 1 , -1 ---> [[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]]
reshape 2 , -1 ---> [[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]]
reshape 4 , -1 ---> [[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]
reshape 5 , -1 ---> [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]
 [17 18 19 20]]
reshape -1 ---> [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
matrix.T:
 [[ 1  5  9 13 17]
 [ 2  6 10 14 18]
 [ 3  7 11 15 19]
 [ 4  8 12 16 20]]
matrix.transpose()
 [[ 1  5  9 13 17]
 [ 2  6 10 14 18]
 [ 3  7 11 15 19]
 [ 4  8 12 16 20]]
np.transpose(matrix)
 [[ 1  5  9 13 17]
 [ 2  6 10 14 18]
 [ 3  7 11 15 19]
 [ 4  8 12 16 20]]
flatten [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]

import numpy as np

matrix_1=np.array(
    [[1,2,3],
    [4,5,6],
    [7,8,9]]
)

matrix_2=np.array(
    [[2,4,6],
    [8,10,12],
    [14,16,18]]
)

print(matrix_1+2)
print(matrix_1-2)
print()
print(matrix_1/2)
print(matrix_1*2)
print()
print(matrix_1+matrix_2)
print(matrix_1-matrix_2)
print(matrix_1/matrix_2)
print(matrix_1*matrix_2)
print()
print(matrix_1+[10,10,10])
print(matrix_1+[[10],[10],[10]])
print(np.add(matrix_1,matrix_2))
print(np.subtract(matrix_1,matrix_2))
print(np.dot(matrix_1,matrix_2))


[[ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[[-1  0  1]
 [ 2  3  4]
 [ 5  6  7]]

[[0.5 1.  1.5]
 [2.  2.5 3. ]
 [3.5 4.  4.5]]
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]

[[ 3  6  9]
 [12 15 18]
 [21 24 27]]
[[-1 -2 -3]
 [-4 -5 -6]
 [-7 -8 -9]]
[[0.5 0.5 0.5]
 [0.5 0.5 0.5]
 [0.5 0.5 0.5]]
[[  2   8  18]
 [ 32  50  72]
 [ 98 128 162]]

[[11 12 13]
 [14 15 16]
 [17 18 19]]
[[11 12 13]
 [14 15 16]
 [17 18 19]]
[[ 3  6  9]
 [12 15 18]
 [21 24 27]]
[[-1 -2 -3]
 [-4 -5 -6]
 [-7 -8 -9]]
[[ 60  72  84]
 [132 162 192]
 [204 252 300]]

import pandas as pd

series_list_1=pd.Series([1,3,5,7,9])
print(series_list_1)
print(series_list_1.index)
print(series_list_1.values)

series_list_2=pd.Series([2,4,6,8,10],index=['a','b','c','d','e'])
print(series_list_2)
print(series_list_2.index)
print(series_list_2.values)


0    1
1    3
2    5
3    7
4    9
dtype: int64
RangeIndex(start=0, stop=5, step=1)
[1 3 5 7 9]
a     2
b     4
c     6
d     8
e    10
dtype: int64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[ 2  4  6  8 10]

import pandas as pd
dict_data={'chicken':16000,'pizza':20000,'hambuger':10000,'friedMeat':25000}

series_list_a=pd.Series(dict_data)
print(series_list_a)
print()
delivery_foods=['chicken','pizza','friedMeat','jajangmyen']
series_list_b=pd.Series(dict_data , index=delivery_foods)
print(series_list_b)
print()
print(series_list_a['chicken'])
print(series_list_a[['chicken','pizza']])
print(series_list_a[series_list_a>19000])
print('chicken'in series_list_a)


chicken      16000
pizza        20000
hambuger     10000
friedMeat    25000
dtype: int64

chicken       16000.0
pizza         20000.0
friedMeat     25000.0
jajangmyen        NaN
dtype: float64

16000
chicken    16000
pizza      20000
dtype: int64
pizza        20000
friedMeat    25000
dtype: int64
True

import pandas as pd
df_profile=pd.DataFrame()

df_profile['Sex']=['M','M','W','W']
df_profile['Age']=[21,25,23,20]
df_profile['Name']=['Kim','Han','Park','Lee']
df_profile


SexAgeName0123

M 21 Kim
M 25 Han
W 23 Park
W 20 Lee

import pandas as pd
import numpy as np
list_col=['Sex','Age','Name']

list_a=[
    ['M',21,'Kim'],['M',25,'Han'],['W',23,'Park']
]

array_a=np.array(list_a)
df_list=pd.DataFrame(list_a, columns=list_col)  #dataFrame with list 
df_array=pd.DataFrame(array_a,columns=list_col) #dataFramw with numpyArray

print(df_list)
print(df_array)
print(df_list.shape,df_array.shape)

data_dic={'category':['chicken','pizza','hamburger','ramen','photato'],
            'price':[20000,14000,16000,5000,1000],
            'size':[13,12,9,5,2]
}
df_food=pd.DataFrame(data_dic)
print(df_food)


  Sex  Age  Name
0   M   21   Kim
1   M   25   Han
2   W   23  Park
  Sex Age  Name
0   M  21   Kim
1   M  25   Han
2   W  23  Park
(3, 3) (3, 3)
    category  price  size
0    chicken  20000    13
1      pizza  14000    12
2  hamburger  16000     9
3      ramen   5000     5
4    photato   1000     2

import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')
df_t
print()
df_t.head()



import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')
print(df_t.index)
print(df_t.columns)
print(df_t.values)
print(df_t.shape)
print(df_t.info())


RangeIndex(start=0, stop=27, step=1)
Index(['시.도', '시군구', '측정소명', '측정소코드', '장비\n가동률\n(%)', '유효\n자료\n획득률\n(%)',
       '유효\n측정\n일수\n(day)', '유효\n측정\n시간\n(hour)', '월평균\n(ppm)', '최저\n(ppm)',
       '최고\n(ppm)', '최고일시', '기준초과\n(회)', '초과율\n(%)', '최저\n(ppm).1',
       '최고\n(ppm).1', '최고일'],
      dtype='object')
[['인천' '강화군' '석모리' 831481 100.0 77.63 22 559 '0.0013' 0.001 0.0037
  2022060112 nan nan 0.0011 0.0019 20220601]
 ['인천' '옹진군' '덕적도' 831491 100.0 95.55 28 688 '0.0015' 0.001 0.0067
  2022061003 nan nan 0.0012 0.0023 20220605]
 ['경기' '이천' '설성면' 131441 100.0 97.22 29 700 '0.0015' 0.0009 0.0041
  2022061111 nan nan 0.0011 0.0021 20220602]
 ['경기' '파주' '파주' 131373 100.0 97.5 30 702 '0.0012' 0.0009 0.0024
  2022060319 nan nan 0.001 0.0015 20220601]
 ['경기' '포천' '관인면' 131451 100.0 97.91 30 705 '0.0011' 0.0005 0.0021
  2022061710 nan nan 0.0006 0.0013 20220608]
 ['경기' '연천' '연천(DMZ)' 131991 100.0 97.63 29 703 '0.0011' 0.0007 0.0024
  2022061710 nan nan 0.0009 0.0015 20220601]
 ['강원' '양구' '방산면' 132401 100.0 97.5 29 702 '0.0008' 0.0007 0.0022
  2022060211 nan nan 0.0007 0.0012 20220601]
 ['강원' '고성' '간성읍' 632421 100.0 91.11 27 656 '0.0011' 0.0008 0.0021
  2022060210 nan nan 0.0008 0.0015 20220601]
 ['강원' '고성' '인제(DMZ)' 132993 100.0 97.5 29 702 '0.0017' 0.0012 0.0025
  2022060211 nan nan 0.0015 0.002 20220601]
 ['강원' '고성' '고성(DMZ)' 132994 100.0 97.91 30 705 '0.0011' 0.0007 0.0022
  2022060202 nan nan 0.001 0.0015 20220601]
 ['강원' '정선' '북평면' 632371 100.0 97.36 29 701 '0.0016' 0.0013 0.0025
  2022060109 nan nan 0.0014 0.002 20220602]
 ['강원' '횡성' '치악산' 632431 96.49 91.8 27 661 '0.002' 0.0015 0.0026
  2022060118 nan nan 0.0018 0.0022 20220602]
 ['강원' '철원' '철원(DMZ)' 132991 100.0 97.91 29 705 '0.0015' 0.0012 0.003
  2022061711 nan nan 0.0014 0.0018 20220625]
 ['강원' '화천' '화천(DMZ)' 132992 100.0 99.44 30 716 '0.0013' 0.001 0.0018
  2022060107 nan nan 0.0011 0.0016 20220601]
 ['충북' '괴산' '청천면' 633361 100.0 96.8 29 697 '0.001' 0.0007 0.0021
  2022060815 nan nan 0.0008 0.0012 20220601]
 ['충북' '음성' '금왕' 633461 100.0 96.8 29 697 '0.001' 0.0006 0.0056
  2022060111 nan nan 0.0007 0.0018 20220601]
 ['충남' '태안' '파도리' 534461 100.0 97.63 30 703 '0.0012' 0.001 0.0036
  2022060512 nan nan 0.001 0.0017 20220610]
 ['충남' '공주' '사곡면' 534341 100.0 50.41 14 363 '0.0012*' 0.001 0.0019
  2022060217 nan nan 0.0011 0.0014 20220602]
 ['전북' '임실' '운암면' 735351 100.0 96.38 29 694 '0.0018' 0.0009 0.0028
  2022061012 nan nan 0.0017 0.002 20220610]
 ['전북' '부안' '새만금' 735172 100.0 97.08 29 699 '0.0016' 0.0005 0.0029
  2022061013 nan nan 0.0006 0.0026 20220601]
 ['전남' '화순' '송단리' 336451 100.0 97.5 29 702 '0.002' 0.0014 0.0081
  2022060420 nan nan 0.0017 0.0041 20220604]
 ['경북' '영덕' '강구면' 437202 100.0 95.27 28 686 '0.001' 0.0004 0.0064
  2022060121 nan nan 0.0007 0.0021 20220601]
 ['경북' '영천' '화북면' 437401 100.0 97.36 29 701 '0.0009' 0.0007 0.0028
  2022062020 nan nan 0.0007 0.0013 20220620]
 ['경북' '의성' '안계면' 437411 100.0 96.38 29 694 '0.0008' 0.0001 0.0021
  2022062023 nan nan 0.0003 0.0012 20220621]
 ['경남' '창원' '대산면' 238241 100.0 97.22 30 700 '0.0011' 0.0007 0.0037
  2022060210 nan nan 0.0008 0.0019 20220602]
 ['경남' '거제' '저구리' 238191 100.0 98.61 30 710 '0.0014' 0.0001 0.0054
  2022061111 nan nan 0.0012 0.002 20220611]
 ['경남' '거창' '남상면' 238481 92.3 90.0 27 648 '0.0021' 0.0017 0.0055
  2022060120 nan nan 0.0018 0.0028 20220601]]
(27, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   시.도              27 non-null     object 
 1   시군구              27 non-null     object 
 2   측정소명             27 non-null     object 
 3   측정소코드            27 non-null     int64  
 4   장비
가동률
(%)       27 non-null     float64
 5   유효
자료
획득률
(%)    27 non-null     float64
 6   유효
측정
일수
(day)   27 non-null     int64  
 7   유효
측정
시간
(hour)  27 non-null     int64  
 8   월평균
(ppm)        27 non-null     object 
 9   최저
(ppm)         27 non-null     float64
 10  최고
(ppm)         27 non-null     float64
 11  최고일시             27 non-null     int64  
 12  기준초과
(회)         0 non-null      float64
 13  초과율
(%)          0 non-null      float64
 14  최저
(ppm).1       27 non-null     float64
 15  최고
(ppm).1       27 non-null     float64
 16  최고일              27 non-null     int64  
dtypes: float64(8), int64(5), object(4)
memory usage: 3.7+ KB
None

import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')
df_t.describe()


측정소코드장비\n가동률\n(%)유효\n자료\n획득률\n(%)유효\n측정\n일수\n(day)유효\n측정\n시간\n(hour)최저\n(ppm)최고\n(ppm)최고일시기준초과\n(회)초과율\n(%)최저\n(ppm).1최고\n(ppm).1최고일countmeanstdmin25%50%75%max
27.000000 27.000000 27.000000 27.000000 27.000000 27.000000 27.000000 2.700000e+01 0.0 0.0 27.000000 27.000000 2.700000e+01
404457.296296 99.584815 94.126296 28.148148 677.740741 0.000859 0.003452 2.022061e+09 NaN NaN 0.001063 0.001870 2.022061e+07
250800.529171 1.604751 9.712634 3.254626 69.926111 0.000382 0.001706 6.589991e+02 NaN NaN 0.000399 0.000611 6.877645e+00
131373.000000 92.300000 50.410000 14.000000 363.000000 0.000100 0.001800 2.022060e+09 NaN NaN 0.000300 0.001200 2.022060e+07
132992.500000 100.000000 95.965000 28.500000 691.000000 0.000700 0.002200 2.022060e+09 NaN NaN 0.000750 0.001500 2.022060e+07
437202.000000 100.000000 97.220000 29.000000 700.000000 0.000900 0.002800 2.022060e+09 NaN NaN 0.001000 0.001800 2.022060e+07
632426.000000 100.000000 97.565000 29.500000 702.500000 0.001000 0.003900 2.022061e+09 NaN NaN 0.001300 0.002050 2.022061e+07
831491.000000 100.000000 99.440000 30.000000 716.000000 0.001700 0.008100 2.022062e+09 NaN NaN 0.001800 0.004100 2.022062e+07
In [23]:
 

 


import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')
df_t[['최고일','측정소코드','최고\n(ppm)']]


최고일측정소코드최고\n(ppm)01234567891011121314151617181920212223242526

20220601 831481 0.0037
20220605 831491 0.0067
20220602 131441 0.0041
20220601 131373 0.0024
20220608 131451 0.0021
20220601 131991 0.0024
20220601 132401 0.0022
20220601 632421 0.0021
20220601 132993 0.0025
20220601 132994 0.0022
20220602 632371 0.0025
20220602 632431 0.0026
20220625 132991 0.0030
20220601 132992 0.0018
20220601 633361 0.0021
20220601 633461 0.0056
20220610 534461 0.0036
20220602 534341 0.0019
20220610 735351 0.0028
20220601 735172 0.0029
20220604 336451 0.0081
20220601 437202 0.0064
20220620 437401 0.0028
20220621 437411 0.0021
20220602 238241 0.0037
20220611 238191 0.0054
20220601 238481 0.0055

import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')
print(df_t['최고\n(ppm)'].values)
print(type(df_t['최고\n(ppm)'].values))
df_t[1:7].values


[0.0037 0.0067 0.0041 0.0024 0.0021 0.0024 0.0022 0.0021 0.0025 0.0022
 0.0025 0.0026 0.003  0.0018 0.0021 0.0056 0.0036 0.0019 0.0028 0.0029
 0.0081 0.0064 0.0028 0.0021 0.0037 0.0054 0.0055]
<class 'numpy.ndarray'>
Out[4]:
array([['인천', '옹진군', '덕적도', 831491, 100.0, 95.55, 28, 688, '0.0015',
        0.001, 0.0067, 2022061003, nan, nan, 0.0012, 0.0023, 20220605],
       ['경기', '이천', '설성면', 131441, 100.0, 97.22, 29, 700, '0.0015',
        0.0009, 0.0041, 2022061111, nan, nan, 0.0011, 0.0021, 20220602],
       ['경기', '파주', '파주', 131373, 100.0, 97.5, 30, 702, '0.0012', 0.0009,
        0.0024, 2022060319, nan, nan, 0.001, 0.0015, 20220601],
       ['경기', '포천', '관인면', 131451, 100.0, 97.91, 30, 705, '0.0011',
        0.0005, 0.0021, 2022061710, nan, nan, 0.0006, 0.0013, 20220608],
       ['경기', '연천', '연천(DMZ)', 131991, 100.0, 97.63, 29, 703, '0.0011',
        0.0007, 0.0024, 2022061710, nan, nan, 0.0009, 0.0015, 20220601],
       ['강원', '양구', '방산면', 132401, 100.0, 97.5, 29, 702, '0.0008',
        0.0007, 0.0022, 2022060211, nan, nan, 0.0007, 0.0012, 20220601]],
      dtype=object)

import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')

df_t[1:7]


시.도시군구측정소명측정소코드장비\n가동률\n(%)유효\n자료\n획득률\n(%)유효\n측정\n일수\n(day)유효\n측정\n시간\n(hour)월평균\n(ppm)최저\n(ppm)최고\n(ppm)최고일시기준초과\n(회)초과율\n(%)최저\n(ppm).1최고\n(ppm).1최고일123456
인천 옹진군 덕적도 831491 100.0 95.55 28 688 0.0015 0.0010 0.0067 2022061003 NaN NaN 0.0012 0.0023 20220605
경기 이천 설성면 131441 100.0 97.22 29 700 0.0015 0.0009 0.0041 2022061111 NaN NaN 0.0011 0.0021 20220602
경기 파주 파주 131373 100.0 97.50 30 702 0.0012 0.0009 0.0024 2022060319 NaN NaN 0.0010 0.0015 20220601
경기 포천 관인면 131451 100.0 97.91 30 705 0.0011 0.0005 0.0021 2022061710 NaN NaN 0.0006 0.0013 20220608
경기 연천 연천(DMZ) 131991 100.0 97.63 29 703 0.0011 0.0007 0.0024 2022061710 NaN NaN 0.0009 0.0015 20220601
강원 양구 방산면 132401 100.0 97.50 29 702 0.0008 0.0007 0.0022 2022060211 NaN NaN 0.0007 0.0012 20220601

import pandas as pd
df_t=pd.read_csv('testFile.csv',encoding='cp949')
df_t[df_t['최고일']==20220601]


시.도시군구측정소명측정소코드장비\n가동률\n(%)유효\n자료\n획득률\n(%)유효\n측정\n일수\n(day)유효\n측정\n시간\n(hour)월평균\n(ppm)최저\n(ppm)최고\n(ppm)최고일시기준초과\n(회)초과율\n(%)최저\n(ppm).1최고\n(ppm).1최고일0356789131415192126

인천 강화군 석모리 831481 100.0 77.63 22 559 0.0013 0.0010 0.0037 2022060112 NaN NaN 0.0011 0.0019 20220601
경기 파주 파주 131373 100.0 97.50 30 702 0.0012 0.0009 0.0024 2022060319 NaN NaN 0.0010 0.0015 20220601
경기 연천 연천(DMZ) 131991 100.0 97.63 29 703 0.0011 0.0007 0.0024 2022061710 NaN NaN 0.0009 0.0015 20220601
강원 양구 방산면 132401 100.0 97.50 29 702 0.0008 0.0007 0.0022 2022060211 NaN NaN 0.0007 0.0012 20220601
강원 고성 간성읍 632421 100.0 91.11 27 656 0.0011 0.0008 0.0021 2022060210 NaN NaN 0.0008 0.0015 20220601
강원 고성 인제(DMZ) 132993 100.0 97.50 29 702 0.0017 0.0012 0.0025 2022060211 NaN NaN 0.0015 0.0020 20220601
강원 고성 고성(DMZ) 132994 100.0 97.91 30 705 0.0011 0.0007 0.0022 2022060202 NaN NaN 0.0010 0.0015 20220601
강원 화천 화천(DMZ) 132992 100.0 99.44 30 716 0.0013 0.0010 0.0018 2022060107 NaN NaN 0.0011 0.0016 20220601
충북 괴산 청천면 633361 100.0 96.80 29 697 0.001 0.0007 0.0021 2022060815 NaN NaN 0.0008 0.0012 20220601
충북 음성 금왕 633461 100.0 96.80 29 697 0.001 0.0006 0.0056 2022060111 NaN NaN 0.0007 0.0018 20220601
전북 부안 새만금 735172 100.0 97.08 29 699 0.0016 0.0005 0.0029 2022061013 NaN NaN 0.0006 0.0026 20220601
경북 영덕 강구면 437202 100.0 95.27 28 686 0.001 0.0004 0.0064 2022060121 NaN NaN 0.0007 0.0021 20220601
경남 거창 남상면 238481 92.3 90.00 27 648 0.0021 0.0017 0.0055 2022060120 NaN NaN 0.0018 0.0028 20220601

' > 파이썬 데이터과학통계학습(23.01.03-23.01.09)(정보문화사)' 카테고리의 다른 글

part04분류-2  (0) 2023.01.08
part04분류-1  (0) 2023.01.08
part03회귀(05~07)  (0) 2023.01.07
part03회귀(01~04)  (0) 2023.01.06
Part01 데이터 과학을 위한 체계  (0) 2023.01.04