본문 바로가기
Data Analysis/Python

Pandas Cheatsheet: 125+ exercises

by Hagrid 2022. 12. 29.
반응형
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/rj-sample-datasets/sample.db
/kaggle/input/rj-sample-datasets/sample-transaction.csv
/kaggle/input/rj-sample-datasets/transaction-1.csv
/kaggle/input/rj-sample-datasets/Students Score - Sheet1.csv
/kaggle/input/rj-sample-datasets/transaction-2.csv
/kaggle/input/datasets-for-pandas/uk-500.csv
/kaggle/input/datasets-for-pandas/abc.csv
/kaggle/input/datasets-for-pandas/abc2.csv
/kaggle/input/datasets-for-pandas/data1.csv
/kaggle/input/datasets-for-pandas/score.csv
/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv
/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv
In [2]:
import pandas
import pandas as pd
import numpy
import numpy as np
import random as rn
import functools
import re

import warnings
warnings.filterwarnings('ignore')

Content List

In [3]:
 
# for i in range(105, 132):
#     print(f'* [{i}. XY](#{i})  ')
In [4]:
 
import pandas
import pandas as pd
import numpy
import numpy as np
import random as rn
import functools
import re
 
 
Task 1: Check Pandas Version
In [5]:
 
print('Task 1:')  
print(pd.__version__)
Task 1:
1.1.1
 
Task 2: Create Numpy Array
Create three columns with Zero values 컬럼 3개 0값으로 만들기 
 
In [6]:
 
print('Task 2:')
dtype = [('Col1','int32'), ('Col2','float32'), ('Col3','float32')]
values = numpy.zeros(20, dtype=dtype)
index = ['Row'+str(i) for i in range(1, len(values)+1)]

df = pandas.DataFrame(values, index=index)
print(df)

df = pandas.DataFrame(values)
print(df)
Task 2:
       Col1  Col2  Col3
Row1      0   0.0   0.0
Row2      0   0.0   0.0
Row3      0   0.0   0.0
Row4      0   0.0   0.0
Row5      0   0.0   0.0
Row6      0   0.0   0.0
Row7      0   0.0   0.0
Row8      0   0.0   0.0
Row9      0   0.0   0.0
Row10     0   0.0   0.0
Row11     0   0.0   0.0
Row12     0   0.0   0.0
Row13     0   0.0   0.0
Row14     0   0.0   0.0
Row15     0   0.0   0.0
Row16     0   0.0   0.0
Row17     0   0.0   0.0
Row18     0   0.0   0.0
Row19     0   0.0   0.0
Row20     0   0.0   0.0
    Col1  Col2  Col3
0      0   0.0   0.0
1      0   0.0   0.0
2      0   0.0   0.0
3      0   0.0   0.0
4      0   0.0   0.0
5      0   0.0   0.0
6      0   0.0   0.0
7      0   0.0   0.0
8      0   0.0   0.0
9      0   0.0   0.0
10     0   0.0   0.0
11     0   0.0   0.0
12     0   0.0   0.0
13     0   0.0   0.0
14     0   0.0   0.0
15     0   0.0   0.0
16     0   0.0   0.0
17     0   0.0   0.0
18     0   0.0   0.0
19     0   0.0   0.0
 
Task 3: iLoc in Pandas / Print first five rows
print('Task 3:')
df = pandas.read_csv('../input/datasets-for-pandas/data1.csv', sep=';', header=None)
print(df.iloc[:4]) # 0 - 4 = 5 values
Task 3:
                       0
0  capacity,score,length
1                1,10,30
2                2,20,30
3                3,30,40
 
Task 4: Create Random integer between 2 to 10 with 4 items
print('Task 4:')
values = np.random.randint(2, 10, size=4)
print(values)
Task 4:
[3 7 8 4]
 
Task 5: Create Random integer between 0 to 100 랜덤 정수 만들기0과100사이 
 
print('Task 5:')
df = pd.DataFrame(np.random.randint(0, 100, size=(3, 2)), columns=list('xy'))
print(df)
Task 5:
    x   y
0  14  26
1  76  65
2  58  25
 
Task 6: Create Random integer between 2 to 10 with 4 columns / 2에서 10 사이 값으로 컬럼4개 랜덤한 정수값으로 만들기 
 
In [10]:
 
print('Task 6:')
df = pd.DataFrame(np.random.randint(0, 100, size=(2, 4)), columns=['A', 'B', 'C', 'D'])
print(df)

numpy randint  = 랜덤하게 선택함 파라미터 ~부터 ~까지 / 사이즈는 행과 열 

Task 6:
   A   B   C   D
0  5  94  84  21
1  3  15  10  97
 
Task 7: 2D array with random between 0 and 5
 
In [11]:
 
print('Task 7:')
values = np.random.randint(5, size=(2, 4))
print(values)
print(type(values))
Task 7:
[[2 0 3 1]
 [3 3 1 1]]
<class 'numpy.ndarray'>
 
Task 8: Create Random integer between 0 to 100 with 10 itmes (2 rows, 5 columns)
 
In [12]:
 
print('Task 8:')
df = pd.DataFrame(np.random.randint(0, 100, size=(3, 5)), columns=['Toronto', 'Ottawa', 'Calgary', 'Montreal', 'Quebec'])
print(df)
Task 8:
   Toronto  Ottawa  Calgary  Montreal  Quebec
0       23      84       12         8      73
1       35       4       94        74      89
2       14      62       57        97      72
 
Task 9:

3 rows, 2 columns in pandas
1st column = random between 10 to 20
2nd column = random between 80 and 90
3rd column = random between 40 and 50
 
In [13]:
 
print('Task 9:')  
dtype = [('One','int32'), ('Two','int32')]
values = np.zeros(3, dtype=dtype)
index = ['Row'+str(i) for i in range(1, 4)]

df = pandas.DataFrame(values, index=index)
print(df)
Task 9:
      One  Two
Row1    0    0
Row2    0    0
Row3    0    0
 
Task 10:

Fill Random Science and Math Marks

(has some bugs in it)
In [14]:
 
print('Task 10:')  
dtype = [('Science','int32'), ('Maths','int32')]
values = np.zeros(3, dtype=dtype)

#print(type(dtype))
#values = np.random.randint(5, size=(3, 2))
#print(values)
#index = ['Row'+str(i) for i in range(1, 4)]

df = pandas.DataFrame(values, index=index)
print(df)
Task 10:
      Science  Maths
Row1        0      0
Row2        0      0
Row3        0      0
 
Task 11:

CSV to DatRaframe (from_csv)
Note: from_csv is Deprecated since version 0.21.0: Use pandas.read_csv() instead.
In [15]:
 
print('Task 11:')  

csv = pd.read_csv('../input/datasets-for-pandas/uk-500.csv')
print(csv.head())
Task 11:
  first_name   last_name             company_name            address  \
0    Aleshia  Tomkiewicz  Alan D Rosenburg Cpa Pc       14 Taylor St   
1       Evan   Zigomalas       Cap Gemini America        5 Binney St   
2     France     Andrade      Elliott, John W Esq       8 Moor Place   
3    Ulysses   Mcwalters           Mcmahan, Ben L      505 Exeter Rd   
4     Tyisha      Veness           Champagne Room  5396 Forth Street   

                             city           county    postal        phone1  \
0               St. Stephens Ward             Kent   CT2 7PP  01835-703597   
1                      Abbey Ward  Buckinghamshire  HP11 2AX  01937-864715   
2  East Southbourne and Tuckton W      Bournemouth   BH6 3BE  01347-368222   
3              Hawerby cum Beesby     Lincolnshire  DN36 5RP  01912-771311   
4      Greets Green and Lyng Ward    West Midlands   B70 9DT  01547-429341   

         phone2                       email  \
0  01944-369967     atomkiewicz@hotmail.com   
1  01714-737668    evan.zigomalas@gmail.com   
2  01935-821636  france.andrade@hotmail.com   
3  01302-601380         ulysses@hotmail.com   
4  01290-367248   tyisha.veness@hotmail.com   

                                    web  
0  http://www.alandrosenburgcpapc.co.uk  
1     http://www.capgeminiamerica.co.uk  
2      http://www.elliottjohnwesq.co.uk  
3          http://www.mcmahanbenl.co.uk  
4        http://www.champagneroom.co.uk  
 
Task 12:

CSV to Dataframe (from_csv
In [16]:
 
print('Task 12:')  
#df = df.from_csv(path, header, sep, index_col, parse_dates, encoding, tupleize_cols, infer_datetime_format)
df = pd.read_csv('../input/datasets-for-pandas/uk-500.csv')
print(df.head())
Task 12:
  first_name   last_name             company_name            address  \
0    Aleshia  Tomkiewicz  Alan D Rosenburg Cpa Pc       14 Taylor St   
1       Evan   Zigomalas       Cap Gemini America        5 Binney St   
2     France     Andrade      Elliott, John W Esq       8 Moor Place   
3    Ulysses   Mcwalters           Mcmahan, Ben L      505 Exeter Rd   
4     Tyisha      Veness           Champagne Room  5396 Forth Street   

                             city           county    postal        phone1  \
0               St. Stephens Ward             Kent   CT2 7PP  01835-703597   
1                      Abbey Ward  Buckinghamshire  HP11 2AX  01937-864715   
2  East Southbourne and Tuckton W      Bournemouth   BH6 3BE  01347-368222   
3              Hawerby cum Beesby     Lincolnshire  DN36 5RP  01912-771311   
4      Greets Green and Lyng Ward    West Midlands   B70 9DT  01547-429341   

         phone2                       email  \
0  01944-369967     atomkiewicz@hotmail.com   
1  01714-737668    evan.zigomalas@gmail.com   
2  01935-821636  france.andrade@hotmail.com   
3  01302-601380         ulysses@hotmail.com   
4  01290-367248   tyisha.veness@hotmail.com   

                                    web  
0  http://www.alandrosenburgcpapc.co.uk  
1     http://www.capgeminiamerica.co.uk  
2      http://www.elliottjohnwesq.co.uk  
3          http://www.mcmahanbenl.co.uk  
4        http://www.champagneroom.co.uk  
 
Task 13:

First 4 rows and 2 columns of CSV
In [17]:
 
print('Task 13:') 
df = pandas.read_csv('../input/datasets-for-pandas/data1.csv', sep=',')
print(df.shape) 
#print(df[2:14])
print(df.iloc[0:4,0:2])
#print(df[df.columns[0]])
Task 13:
(9, 3)
   capacity  score
0         1     10
1         2     20
2         3     30
3         3     40
 
Task 14:

Show even rows and first three columns
In [18]:
 
print('Task 14:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)

print(df.iloc[::2, 0:3])    
Task 14:
(7, 5)
  student  language  science
0   kumar        90       56
2   sammy        90       23
4   peter        30       56
6  carrol        50       90
 
Task 15:

New columns as sum of all
In [19]:
 
print('Task 15:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)
print(df) 
df['total'] = df.sum(axis=1)

print(df)
Task 15:
(7, 5)
  student  language  science  maths  history
0   kumar        90       56     34       34
1   kevin        10       34     32       67
2   sammy        90       23     12       32
3  janice        20       67     90       45
4   peter        30       56     45       65
5    prem        90       45     45       34
6  carrol        50       90     45       23
  student  language  science  maths  history  total
0   kumar        90       56     34       34    214
1   kevin        10       34     32       67    143
2   sammy        90       23     12       32    157
3  janice        20       67     90       45    222
4   peter        30       56     45       65    196
5    prem        90       45     45       34    214
6  carrol        50       90     45       23    208
 
Task 16:

Delete Rows of one column where the value is less than 50
In [20]:
 
print('Task 16:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)
print(df) 

df = df[df.science > 50]
print(df)
Task 16:
(7, 5)
  student  language  science  maths  history
0   kumar        90       56     34       34
1   kevin        10       34     32       67
2   sammy        90       23     12       32
3  janice        20       67     90       45
4   peter        30       56     45       65
5    prem        90       45     45       34
6  carrol        50       90     45       23
  student  language  science  maths  history
0   kumar        90       56     34       34
3  janice        20       67     90       45
4   peter        30       56     45       65
6  carrol        50       90     45       23
 
Task 17:

Delete with Query
Note: Query doesn't work if your column has space in it
In [21]:
 
print('Task 17:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)
print(df) 

df = df.query('science > 45')
print(df)
Task 17:
(7, 5)
  student  language  science  maths  history
0   kumar        90       56     34       34
1   kevin        10       34     32       67
2   sammy        90       23     12       32
3  janice        20       67     90       45
4   peter        30       56     45       65
5    prem        90       45     45       34
6  carrol        50       90     45       23
  student  language  science  maths  history
0   kumar        90       56     34       34
3  janice        20       67     90       45
4   peter        30       56     45       65
6  carrol        50       90     45       23
 
Task 18:

Skip single row
In [22]:
 
print('Task 18:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8", skiprows=[5])
print(df.shape)
print(df)
Task 18:
(6, 5)
  student  language  science  maths  history
0   kumar        90       56     34       34
1   kevin        10       34     32       67
2   sammy        90       23     12       32
3  janice        20       67     90       45
4    prem        90       45     45       34
5  carrol        50       90     45       23
 
Task 19:

Skip multiple rows
In [23]:
 
print('Task 19:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8", skiprows=[1, 5, 7])
print(df.shape)
#print(df) 

#df = df[df[[1]] > 45]
print(df)
Task 19:
(4, 5)
  student  language  science  maths  history
0   kevin        10       34     32       67
1   sammy        90       23     12       32
2  janice        20       67     90       45
3    prem        90       45     45       34
 
Task 20:

Select Column by Index

Note:
df[[1]] doesn't work in Pandas updated version (need to double check)
New columns as sum of all
In [24]:
 
print('Task 20:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)
print(df) 

#df = df[int(df.columns[2]) > 45]
print(df)
print(type(df.columns[2]))
Task 20:
(7, 5)
  student  language  science  maths  history
0   kumar        90       56     34       34
1   kevin        10       34     32       67
2   sammy        90       23     12       32
3  janice        20       67     90       45
4   peter        30       56     45       65
5    prem        90       45     45       34
6  carrol        50       90     45       23
  student  language  science  maths  history
0   kumar        90       56     34       34
1   kevin        10       34     32       67
2   sammy        90       23     12       32
3  janice        20       67     90       45
4   peter        30       56     45       65
5    prem        90       45     45       34
6  carrol        50       90     45       23
<class 'str'>
 
Task 21:

Skip rows
Note:
df[[1]] doesn't work in Pandas updated version (need to double check)
In [25]:
 
print('Task 21:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8", skiprows=[0])
print(df.shape)
print(df) 

#df = df[int(df.columns[2]) > 45]
#print(df)
print(df.columns[2])
Task 21:
(6, 5)
    kumar  90  56  34  34.1
0   kevin  10  34  32    67
1   sammy  90  23  12    32
2  janice  20  67  90    45
3   peter  30  56  45    65
4    prem  90  45  45    34
5  carrol  50  90  45    23
56
 
Task 22:

String to Dataframe
Note:
df[[1]] doesn't work in Pandas updated version (need to double check)
In [26]:
 
print('Task 22:')
from io import StringIO

s = """
        1, 2
        3, 4
        5, 6
    """

df = pd.read_csv(StringIO(s), header=None)

print(df.shape)
print(df)
Task 22:
(3, 2)
   0  1
0  1  2
1  3  4
2  5  6
 
Task 23:

New columns as max of other columns
float to int used
In [27]:
 
print('Task 23:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)
df['sum'] = df.sum(axis=1)
df['max'] = df.max(axis=1)
df['min'] = df.min(axis=1)
df['average'] = df.mean(axis=1).astype(int)
print(df)
Task 23:
(7, 5)
  student  language  science  maths  history  sum  max  min  average
0   kumar        90       56     34       34  214  214   34       96
1   kevin        10       34     32       67  143  143   10       62
2   sammy        90       23     12       32  157  157   12       69
3  janice        20       67     90       45  222  222   20       98
4   peter        30       56     45       65  196  196   30       88
5    prem        90       45     45       34  214  214   34       96
6  carrol        50       90     45       23  208  208   23       92
 
Task 24:

New columns as max of other columns
float to int used
Math is considered more, so double the marks for maths
In [28]:
 
def apply_math_special(row):
    return (row.maths * 2 + row.language / 2 + row.history / 3 + row.science) / 4                

print('Task 24:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)
df['sum'] = df.sum(axis=1)
df['max'] = df.max(axis=1)
df['min'] = df.min(axis=1)
df['average'] = df.mean(axis=1).astype(int)
df['math_special'] = df.apply(apply_math_special, axis=1).astype(int)
print(df)
Task 24:
(7, 5)
  student  language  science  maths  history  sum  max  min  average  \
0   kumar        90       56     34       34  214  214   34       96   
1   kevin        10       34     32       67  143  143   10       62   
2   sammy        90       23     12       32  157  157   12       69   
3  janice        20       67     90       45  222  222   20       98   
4   peter        30       56     45       65  196  196   30       88   
5    prem        90       45     45       34  214  214   34       96   
6  carrol        50       90     45       23  208  208   23       92   

   math_special  
0            45  
1            31  
2            25  
3            68  
4            45  
5            47  
6            53  
 
Task 25:

New columns as max of other columns
35 marks considered as pass
If the student fails in math, consider fail
If the student passes in language and science, consider as pass
In [29]:
 
def pass_one_subject(row):
    if(row.maths > 34):
        return 'Pass'
    if(row.language > 34 and row.science > 34):
        return 'Pass'
    
    return 'Fail'                

print('Task 25:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc.csv', sep=',', encoding = "utf-8")
print(df.shape)   

df['pass_one'] = df.apply(pass_one_subject, axis=1)
print(df)
Task 25:
(7, 5)
  student  language  science  maths  history pass_one
0   kumar        90       56     34       34     Pass
1   kevin        10       34     32       67     Fail
2   sammy        90       23     12       32     Fail
3  janice        20       67     90       45     Pass
4   peter        30       56     45       65     Pass
5    prem        90       45     45       34     Pass
6  carrol        50       90     45       23     Pass
 
Task 26:

Fill with average
In [30]:
 
print('Task 26:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/abc2.csv', sep=',', encoding = "utf-8")
print(df.shape)   
print(df)
df.fillna(df.mean(), inplace=True)

#df['pass_one'] = df.apply(pass_one_subject, axis=1)
print(df)
Task 26:
(7, 5)
  student  language  science  maths  history
0   kumar        90     56.0   34.0       34
1   kevin        10      NaN   32.0       67
2   sammy        90     23.0   12.0       32
3  janice        20      NaN   90.0       45
4   peter        30     56.0   45.0       65
5    prem        90     45.0    NaN       34
6  carrol        50     90.0   45.0       23
  student  language  science  maths  history
0   kumar        90     56.0   34.0       34
1   kevin        10     54.0   32.0       67
2   sammy        90     23.0   12.0       32
3  janice        20     54.0   90.0       45
4   peter        30     56.0   45.0       65
5    prem        90     45.0   43.0       34
6  carrol        50     90.0   45.0       23
 
Task 27:

New columns as sum of all
In [31]:
 
print('Task 27:')
df = pd.DataFrame(np.random.rand(10, 5))
df.iloc[0:3, 0:4] = np.nan # throw in some na values
print(df)
df.loc[:, 'test'] = df.iloc[:, 2:].sum(axis=1)
print(df)
Task 27:
          0         1         2         3         4
0       NaN       NaN       NaN       NaN  0.572887
1       NaN       NaN       NaN       NaN  0.726606
2       NaN       NaN       NaN       NaN  0.898058
3  0.820883  0.281087  0.061915  0.251880  0.690323
4  0.827387  0.366752  0.472221  0.344950  0.760725
5  0.826558  0.940475  0.564082  0.638462  0.313050
6  0.082206  0.610940  0.716961  0.895827  0.446285
7  0.235996  0.154126  0.869530  0.586864  0.915977
8  0.403853  0.771666  0.408973  0.085731  0.588707
9  0.001154  0.417254  0.155335  0.893756  0.825199
          0         1         2         3         4      test
0       NaN       NaN       NaN       NaN  0.572887  0.572887
1       NaN       NaN       NaN       NaN  0.726606  0.726606
2       NaN       NaN       NaN       NaN  0.898058  0.898058
3  0.820883  0.281087  0.061915  0.251880  0.690323  1.004118
4  0.827387  0.366752  0.472221  0.344950  0.760725  1.577897
5  0.826558  0.940475  0.564082  0.638462  0.313050  1.515594
6  0.082206  0.610940  0.716961  0.895827  0.446285  2.059073
7  0.235996  0.154126  0.869530  0.586864  0.915977  2.372371
8  0.403853  0.771666  0.408973  0.085731  0.588707  1.083410
9  0.001154  0.417254  0.155335  0.893756  0.825199  1.874290
 
Task 28:

Unicode issue and fix
In [32]:
 
print('Task 28:') 
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/score.csv', sep=',', encoding = "ISO-8859-1")
print(df.shape) 
Task 28:
(16, 5)
 
Task 29:

Fill with average
In [33]:
 
print('Task 29:') 
df = pd.DataFrame(np.random.rand(3,4), columns=list("ABCD"))
print(df.shape)   
print(df)
df.fillna(df.mean(), inplace=True)

print(df)
Task 29:
(3, 4)
          A         B         C         D
0  0.530069  0.327372  0.535214  0.532122
1  0.530380  0.795693  0.533263  0.174414
2  0.655037  0.247654  0.377571  0.344309
          A         B         C         D
0  0.530069  0.327372  0.535214  0.532122
1  0.530380  0.795693  0.533263  0.174414
2  0.655037  0.247654  0.377571  0.344309
 
Task 30:

Last 4 rows
In [34]:
 
print('Task 30:')  
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/data1.csv', sep=';') 
print(df[-4:])
Task 30:
  capacity,score,length
5               7,10,23
6               3,20,22
7               8,20,11
8                2,30,2
 
Task 31:

Expanding Apply
In [35]:
 
print('Task 31:')
series1 = pd.Series([i / 100.0 for i in range(1,6)])
print(series1)
def CumRet(x,y):
    return x * (1 + y)
def Red(x):
    return functools.reduce(CumRet,x,1.0)
s2 = series1.expanding().apply(Red)
# s2 = series1.expanding().apply(Red, raw=True) # is not working
print(s2)
Task 31:
0    0.01
1    0.02
2    0.03
3    0.04
4    0.05
dtype: float64
0    1.010000
1    1.030200
2    1.061106
3    1.103550
4    1.158728
dtype: float64
 
Task 32:

Get 3 and 4th row
In [36]:
 
print('Task 32:')  
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/data1.csv', sep=';') 
print(df[2:4])
Task 32:
  capacity,score,length
2               3,30,40
3               3,40,30
 
Task 33:

Last 4th to 1st
In [37]:
 
print('Task 33:')  
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/data1.csv', sep=';') 
print(df[-4:-1])
Task 33:
  capacity,score,length
5               7,10,23
6               3,20,22
7               8,20,11
 
Task 34:

iloc position slice
In [38]:
 
print('Task 34:')  
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/data1.csv', sep=';') 
print(df.iloc[1:9])
Task 34:
  capacity,score,length
1               2,20,30
2               3,30,40
3               3,40,30
4               2,30,40
5               7,10,23
6               3,20,22
7               8,20,11
8                2,30,2
 
Task 35:

Loc - iloc - ix - at - iat
In [39]:
 
print('Task 35:')  
df = pandas.read_csv('/kaggle/input/datasets-for-pandas/data1.csv', sep=';')
Task 35:
 
Task 36:

Random data
In [40]:
 
print('Task 36:')  
def xrange(x):
    return iter(range(x))

rnd_1  =  [ rn.randrange ( 1 , 20 )  for  x  in  xrange ( 1000 )] 
rnd_2  =  [ rn.randrange ( 1 , 20 )  for  x  in  xrange ( 1000 )] 
rnd_3  =  [ rn.randrange ( 1 , 20 )  for  x in  xrange ( 1000 )] 
date  =  pd . date_range ( '2012-4-10' ,  '2015-1-4' )
print(len(date))
data  =  pd . DataFrame ({ 'date' : date ,  'rnd_1' :  rnd_1 ,  'rnd_2' :  rnd_2 ,  'rnd_3' :  rnd_3 })

data.head()
Task 36:
1000
Out[40]:
daternd_1rnd_2rnd_301234
2012-04-10 19 2 17
2012-04-11 19 13 2
2012-04-12 16 9 3
2012-04-13 1 19 19
2012-04-14 7 15 16
 
Task 37:

Filter with the value comparison
In [41]:
 
print('Task 37:')
below_20 = data[data['rnd_1'] < 20]    
print(below_20)
Task 37:
          date  rnd_1  rnd_2  rnd_3
0   2012-04-10     19      2     17
1   2012-04-11     19     13      2
2   2012-04-12     16      9      3
3   2012-04-13      1     19     19
4   2012-04-14      7     15     16
..         ...    ...    ...    ...
995 2014-12-31     10     18     13
996 2015-01-01     19      9      7
997 2015-01-02      6      6     13
998 2015-01-03      4      9      1
999 2015-01-04      1     16     15

[1000 rows x 4 columns]
 
Task 38:

Filter between 5 and 10 on col 1
In [42]:
 
print('Task 38:') 
def xrange(x):
    return iter(range(x))
rnd_1  =  [ rn.randrange ( 1 , 20 )  for  x  in  xrange ( 1000 )] 
rnd_2  =  [ rn.randrange ( 1 , 20 )  for  x  in  xrange ( 1000 )] 
rnd_3  =  [ rn.randrange ( 1 , 20 )  for  x in  xrange ( 1000 )] 
date  =  pd . date_range ( '2012-4-10' ,  '2015-1-4' )
print(len(date))
data  =  pd . DataFrame ({ 'date' : date ,  'rnd_1' :  rnd_1 ,  'rnd_2' :  rnd_2 ,  'rnd_3' :  rnd_3 })
below_20 = data[data['rnd_1'] < 20]
ten_to_20 = data[(data['rnd_1'] >= 5) & (data['rnd_1'] < 10)]
#print(ten_to_20)
Task 38:
1000
 
Task 39:

Filter between 15 to 20
In [43]:
 
print('Task 39:')      
date  =  pd . date_range ( '2018-08-01' ,  '2018-08-15' )
date_count = len(date)

def fill_rand(start, end, count):
    return [rn.randrange(1, 20 ) for x in xrange( count )]

rnd_1 = fill_rand(1, 20, date_count) 
rnd_2 = fill_rand(1, 20, date_count) 
rnd_3 = fill_rand(1, 20, date_count)
#print(len(date))
data  =  pd . DataFrame ({ 'date' : date ,  'rnd_1' :  rnd_1 ,  'rnd_2' :  rnd_2 ,  'rnd_3' :  rnd_3 })
#print(len(date))
ten_to_20 = data[(data['rnd_1'] >= 15) & (data['rnd_1'] < 20)]
print(ten_to_20)
Task 39:
         date  rnd_1  rnd_2  rnd_3
0  2018-08-01     15     15     11
1  2018-08-02     15     14     16
2  2018-08-03     18      2     15
5  2018-08-06     19     18      9
10 2018-08-11     18      4      7
 
Task 40:

15 to 33
In [44]:
 
print('Task 40:')      
date  =  pd . date_range ( '2018-08-01' ,  '2018-08-15' )
date_count = len(date)

def fill_rand(start, end, count):
    return [rn.randrange(1, 20 ) for x in xrange( count )]

rnd_1 = fill_rand(1, 20, date_count) 
rnd_2 = fill_rand(1, 20, date_count) 
rnd_3 = fill_rand(1, 20, date_count)

data  =  pd . DataFrame ({ 'date' : date ,  'rnd_1' :  rnd_1 ,  'rnd_2' :  rnd_2 ,  'rnd_3' :  rnd_3 })

ten_to_20 = data[(data['rnd_1'] >= 15) & (data['rnd_1'] < 33)]
print(ten_to_20)
Task 40:
         date  rnd_1  rnd_2  rnd_3
3  2018-08-04     18      7     14
4  2018-08-05     19     10     13
6  2018-08-07     15      5     15
8  2018-08-09     16     13     15
12 2018-08-13     17      3     15
 
Task 41:

Custom method and xrnage on dataframe
In [45]:
 
print('Task 41:')  
date  =  pd . date_range ( '2018-08-01' ,  '2018-08-15' )
date_count = len(date)

def xrange(x):
    return iter(range(x))

def fill_rand(start, end, count):
    return [rn.randrange(1, 20 ) for x in xrange( count )]

rnd_1 = fill_rand(1, 20, date_count) 
rnd_2 = fill_rand(1, 20, date_count) 
rnd_3 = fill_rand(1, 20, date_count)

data  =  pd . DataFrame ({ 'date' : date ,  'rnd_1' :  rnd_1 ,  'rnd_2' :  rnd_2 ,  'rnd_3' :  rnd_3 })
filter_loc = data.loc[ 2 : 4 ,  [ 'rnd_2' ,  'date' ]]
print(filter_loc)
Task 41:
   rnd_2       date
2     17 2018-08-03
3      3 2018-08-04
4     15 2018-08-05
 
Task 42:

Set index with date column
In [46]:
 
print('Task 42:')
date_date = data.set_index( 'date' ) 
print(date_date.head())
Task 42:
            rnd_1  rnd_2  rnd_3
date                           
2018-08-01      7     11     16
2018-08-02      7      5      3
2018-08-03     11     17      7
2018-08-04      9      3      8
2018-08-05      9     15      1
 
Task 43:

Change columns based on other columns
In [47]:
 
print('Task 43:') 
df = pd.DataFrame({
    'a' : [1,2,3,4], 
    'b' : [9,8,7,6],
    'c' : [11,12,13,14]
})
print(df) 

print('changing on one column')
# Change columns
df.loc[df.a >= 2,'b'] = 9
print(df)
Task 43:
   a  b   c
0  1  9  11
1  2  8  12
2  3  7  13
3  4  6  14
changing on one column
   a  b   c
0  1  9  11
1  2  9  12
2  3  9  13
3  4  9  14
 
Task 44:

Change multiple columns based on one column values
In [48]:
 
print('Task 44:')  
print('changing on multipe columns')
df.loc[df.a > 2,['b', 'c']] = 45
print(df)
Task 44:
changing on multipe columns
   a   b   c
0  1   9  11
1  2   9  12
2  3  45  45
3  4  45  45
 
Task 45:

Pandas Mask
In [49]:
 
print('Task 45:')  
print(df)
df_mask = pd.DataFrame({
    'a' : [True] * 4, 
    'b' : [False] * 4,
    'c' : [True, False] * 2
})
print(df.where(df_mask,-1000))
Task 45:
   a   b   c
0  1   9  11
1  2   9  12
2  3  45  45
3  4  45  45
   a     b     c
0  1 -1000    11
1  2 -1000 -1000
2  3 -1000    45
3  4 -1000 -1000
 
Task 46:

Check high or low comparing the column against 5
In [50]:
 
print('Task 46:')
print(df)  
df['logic'] = np.where(df['a'] > 5, 'high', 'low')
print(df)
Task 46:
   a   b   c
0  1   9  11
1  2   9  12
2  3  45  45
3  4  45  45
   a   b   c logic
0  1   9  11   low
1  2   9  12   low
2  3  45  45   low
3  4  45  45   low
 
Task 47:

Student Marks (Pass or Fail)
In [51]:
 
print('Task 47:')
marks_df = pd.DataFrame({
    'Language' : [60, 45, 78, 4], 
    'Math' : [90, 80, 23, 60],
    'Science' : [45, 90, 95, 20]
});
print(marks_df)
marks_df['language_grade'] = np.where(marks_df['Language'] >= 50, 'Pass', 'Fail')
marks_df['math_grade'] = np.where(marks_df['Math'] >= 50, 'Pass', 'Fail')
marks_df['science_grade'] = np.where(marks_df['Science'] >= 50, 'Pass', 'Fail')
print(marks_df)
Task 47:
   Language  Math  Science
0        60    90       45
1        45    80       90
2        78    23       95
3         4    60       20
   Language  Math  Science language_grade math_grade science_grade
0        60    90       45           Pass       Pass          Fail
1        45    80       90           Fail       Pass          Pass
2        78    23       95           Pass       Fail          Pass
3         4    60       20           Fail       Pass          Fail
 
Task 48:

Get passed grades
In [52]:
 
print('Task 48:')  
marks_df = pd.DataFrame({
    'Language' : [60, 45, 78, 4], 
    'Math' : [90, 80, 23, 60],
    'Science' : [45, 90, 95, 20]
});
print(marks_df)
marks_df_passed_in_language = marks_df[marks_df.Language >=50 ]
print(marks_df_passed_in_language)
Task 48:
   Language  Math  Science
0        60    90       45
1        45    80       90
2        78    23       95
3         4    60       20
   Language  Math  Science
0        60    90       45
2        78    23       95
 
Task 49:

Students passed in Language and Math
In [53]:
 
print('Task 49:')  
marks_df_passed_in_lang_math = marks_df[(marks_df.Language >=50) & (marks_df.Math >= 50)]
print(marks_df_passed_in_lang_math)
Task 49:
   Language  Math  Science
0        60    90       45
 
Task 50:

Students passed in Language and Science
In [54]:
 
print('Task 50:')  
marks_df_passed_in_lang_and_sc = marks_df.loc[(marks_df.Language >=50) & (marks_df.Science >= 50)]
print(marks_df_passed_in_lang_and_sc)
Task 50:
   Language  Math  Science
2        78    23       95
 
Task 51:

Loc with Label oriented slicing
possible error:
pandas.errors.UnsortedIndexError
In [55]:
 
print('Task 51:')
stars = {
    'age' : [31, 23, 65, 50],
    'movies' : [51, 23, 87, 200],
    'awards' : [42, 12, 4, 78]
    }
star_names = ['dhanush', 'simbu', 'kamal', 'vikram']
stars_df = pd.DataFrame(data=stars, index=[star_names])
print(stars_df)
Task 51:
         age  movies  awards
dhanush   31      51      42
simbu     23      23      12
kamal     65      87       4
vikram    50     200      78
 
Task 52:

iloc with positional slicing
In [56]:
 
print('Task 52:')  
print(stars_df.iloc[1:3])
Task 52:
       age  movies  awards
simbu   23      23      12
kamal   65      87       4
 
Task 53:

Label between numbers
In [57]:
 
print('Task 53:')  
numbers = pd.DataFrame({
        'one' : [10, 50, 80, 40],
        'two' : [2, 6, 56, 45]
    },
    index = [12, 14, 16, 18])
print(numbers)

print('label between 12 and 16')
print(numbers.loc[12:16])

print('index between 1 and 3')
print(numbers.iloc[1:3])
Task 53:
    one  two
12   10    2
14   50    6
16   80   56
18   40   45
label between 12 and 16
    one  two
12   10    2
14   50    6
16   80   56
index between 1 and 3
    one  two
14   50    6
16   80   56
 
Task 54:

Stars with names
In [58]:
 
print('Task 54:') 
stars = {
    'age' : [31, 23, 65, 50],
    'movies' : [51, 23, 87, 200],
    'awards' : [42, 12, 4, 78]
    }
star_names = ['dhanush', 'simbu', 'kamal', 'vikram']
stars_df = pd.DataFrame(data=stars, index=[star_names])
numbers = pd.DataFrame({
        'one' : [10, 50, 80, 40],
        'two' : [2, 6, 56, 45]
    },
    index = [12, 14, 16, 18])
print(numbers)
Task 54:
    one  two
12   10    2
14   50    6
16   80   56
18   40   45
 
Task 55:

Row label selection
Age is above 25 and movies above 25
In [59]:
 
print('Task 55:')

age_movies_25 = stars_df[(stars_df.movies > 25 ) & (stars_df.age > 25)]  
print(age_movies_25)
Task 55:
         age  movies  awards
dhanush   31      51      42
kamal     65      87       4
vikram    50     200      78
 
Task 56:

Stars in in certain ages
In [60]:
 
print('Task 56:')  
custom_stars = stars_df[stars_df.age.isin([31, 65])]
print(custom_stars)
Task 56:
         age  movies  awards
dhanush   31      51      42
kamal     65      87       4
 
Task 57:

inverse opeartor
!( above one.45 and below two.50 )
In [61]:
 
print('Task 57:')  
print(numbers)
print(numbers[~( (numbers.one > 45) & (numbers.two < 50) )])
Task 57:
    one  two
12   10    2
14   50    6
16   80   56
18   40   45
    one  two
12   10    2
16   80   56
18   40   45
 
Task 58:

Apply custom function
In [62]:
 
print('Task 58:')
def GrowUp(x):
    avg_weight =  sum(x[x['size'] == 'series1'].weight * 1.5)
    avg_weight += sum(x[x['size'] == 'M'].weight * 1.25)
    avg_weight += sum(x[x['size'] == 'L'].weight)
    avg_weight /= len(x)
    return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult'])

animals_df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(),
                   'size': list('SSMMMLL'),
                   'weight': [8, 10, 11, 1, 20, 12, 12],
                   'adult' : [False] * 5 + [True] * 2})

gb = animals_df.groupby(['animal'])

expected_df = gb.apply(GrowUp)
print(expected_df)
Task 58:
       size   weight  adult
animal                     
cat       L   9.4375   True
dog       L  12.5000   True
fish      L   1.2500   True
 
Task 59:

Group by single column
In [63]:
 
print('Task 59:')
weights = animals_df.groupby(['weight']).get_group(20)  
print(weights)
Task 59:
  animal size  weight  adult
4    dog    M      20  False
 
Task 60:

Creating new Columns using Applymap
Sides & applymap
In [64]:
 
print('Task 60:')
sides_df = pd.DataFrame({
    'a' : [1, 1, 2, 4],
    'b' : [2, 1, 3, 4]
    })  
print(sides_df)
source_cols = sides_df.columns
print(source_cols)
new_cols = [str(x)+"_side" for x in source_cols]
side_category = {
    1 : 'North',
    2 : 'East',
    3 : 'South', 
    4 : 'West'
    }
sides_df[new_cols] = sides_df[source_cols].applymap(side_category.get)
print(sides_df)
Task 60:
   a  b
0  1  2
1  1  1
2  2  3
3  4  4
Index(['a', 'b'], dtype='object')
   a  b a_side b_side
0  1  2  North   East
1  1  1  North  North
2  2  3   East  South
3  4  4   West   West
 
Task 61:

Replacing some values with mean of the rest of a group
In [65]:
 
print('Task 61:')  
df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]})
print(df)

gb = df.groupby('A')

def replace(g):
    mask = g < 0
    g.loc[mask] = g[~mask].mean()
    return g

gbt = gb.transform(replace)

print(gbt)
Task 61:
   A  B
0  1  1
1  1 -1
2  2  1
3  2  2
     B
0  1.0
1  1.0
2  1.0
3  2.0
 
Task 62:

Students passed in Language or Science (any one subject)
In [66]:
 
print('Task 62:') 
marks_df = pd.DataFrame({
    'Language' : [60, 45, 78, 4], 
    'Math' : [90, 80, 23, 60],
    'Science' : [45, 90, 95, 20]
});
print(marks_df)
marks_df_passed_in_lang_or_sc = marks_df.loc[(marks_df.Language >=50) | (marks_df.Science >= 50)]
print(marks_df_passed_in_lang_or_sc)
Task 62:
   Language  Math  Science
0        60    90       45
1        45    80       90
2        78    23       95
3         4    60       20
   Language  Math  Science
0        60    90       45
1        45    80       90
2        78    23       95
 
Task 63:

possible errors:
TypeError: 'Series' objects are mutable, thus they cannot be hashed
In [67]:
 
print('Task 63:')  
marks_df['passed_one_subject'] = 'Fail' 
marks_df.loc[(marks_df.Language >=50) , 'passed_one_subject'] = 'Pass'
print(marks_df)
Task 63:
   Language  Math  Science passed_one_subject
0        60    90       45               Pass
1        45    80       90               Fail
2        78    23       95               Pass
3         4    60       20               Fail
 
Task 64:

argsort
Select rows with data closest to certain value using argsort
In [68]:
 
print('Task 64:')  
df = pd.DataFrame({
    "a": np.random.randint(0, 100, size=(5,)), 
    "b": np.random.randint(0, 70, size=(5,))
})
print(df)
par = 65
print('with argsort')
df1 = df.loc[(df.a-par).abs().argsort()]
print(df1)

print(df.loc[(df.b-2).abs().argsort()])
Task 64:
    a   b
0  44   5
1  77  53
2  43  48
3  49   7
4  24  60
with argsort
    a   b
1  77  53
3  49   7
0  44   5
2  43  48
4  24  60
    a   b
0  44   5
3  49   7
2  43  48
1  77  53
4  24  60
 
Task 65:

argsort with stars
old stars (near by 50 age) argsort
In [69]:
 
print('Task 65:')  
stars = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "movies": [2, 3, 90, 45, 34, 2] 
})
print(stars.loc[(stars.age - 50).abs().argsort()])
Task 65:
   age  movies
1   50       3
3   45      45
4   65      34
2   24      90
5   18       2
0   17       2
 
Task 66:

Argsort with actors
young stars (near by 17)
In [70]:
 
print('Task 66:')  
print(stars.loc[(stars.age - 17).abs().argsort()])
Task 66:
   age  movies
0   17       2
5   18       2
2   24      90
3   45      45
1   50       3
4   65      34
 
Task 67:

Binary operators
Stars with
younger than 19 - very young
more movies acted**
In [71]:
 
print('Task 67:')
stars = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "movies": [22, 33, 90, 75, 34, 2] 
})  
print(stars)
print('Young and more movies acted')
young = stars.age < 30    
more_movies = stars.movies > 30
young_more = [young, more_movies]
young_more_Criteria = functools.reduce(lambda x, y : x & y, young_more)
print(stars[young_more_Criteria])
Task 67:
   age  movies
0   17      22
1   50      33
2   24      90
3   45      75
4   65      34
5   18       2
Young and more movies acted
   age  movies
2   24      90
 
Task 68:

Young, Higher Salary, and Higher Position
In [72]:
 
print('Task 68:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8] 
})  
print(employees)
print('Young, Higher Salary, and Higher Position')
young = employees.age < 30
high_salary = employees.salary > 60
high_position = employees.grade > 6
young_salary_position = [young, high_salary, high_position]
young_salary_position_Criteria = functools.reduce(lambda x, y : x & y, young_salary_position)
print(employees[young_salary_position_Criteria])
Task 68:
   age  salary  grade
0   17      75      7
1   50      33      8
2   24      90      9
3   45     175      2
4   65     134      7
5   18      78      8
Young, Higher Salary, and Higher Position
   age  salary  grade
0   17      75      7
2   24      90      9
5   18      78      8
 
Task 69:

Rename columns
In [73]:
 
print('Task 69:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8] 
})  
print(employees)
employees.rename(columns={'age': 'User Age', 'salary': 'Salary 2018'}, inplace=True)
print(employees)
Task 69:
   age  salary  grade
0   17      75      7
1   50      33      8
2   24      90      9
3   45     175      2
4   65     134      7
5   18      78      8
   User Age  Salary 2018  grade
0        17           75      7
1        50           33      8
2        24           90      9
3        45          175      2
4        65          134      7
5        18           78      8
 
Task 70:

Add a new column
In [74]:
 
print('Task 70:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8] 
})  
print(employees)
employees['group'] = pd.Series(np.random.randn(len(employees)))
print(employees)
Task 70:
   age  salary  grade
0   17      75      7
1   50      33      8
2   24      90      9
3   45     175      2
4   65     134      7
5   18      78      8
   age  salary  grade     group
0   17      75      7  0.090702
1   50      33      8 -1.199785
2   24      90      9 -1.363444
3   45     175      2 -0.083164
4   65     134      7  0.169881
5   18      78      8  1.006376
 
Task 71:

Drop a column
In [75]:
 
print('Task 71:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8] 
})  
print(employees)
employees['group'] = pd.Series(np.random.randn(len(employees)))
print(employees)
employees.drop(employees.columns[[0]], axis=1, inplace = True)
print(employees)
Task 71:
   age  salary  grade
0   17      75      7
1   50      33      8
2   24      90      9
3   45     175      2
4   65     134      7
5   18      78      8
   age  salary  grade     group
0   17      75      7 -0.954404
1   50      33      8  0.588388
2   24      90      9 -0.439521
3   45     175      2 -2.856544
4   65     134      7  0.174647
5   18      78      8 -0.500666
   salary  grade     group
0      75      7 -0.954404
1      33      8  0.588388
2      90      9 -0.439521
3     175      2 -2.856544
4     134      7  0.174647
5      78      8 -0.500666
 
Task 72:

Drop multiple columns
In [76]:
 
print('Task 72:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8] 
})  
print(employees)
employees['group'] = pd.Series(np.random.randn(len(employees)))
print(employees)
employees.drop(employees.columns[[1, 2]], axis=1, inplace = True)
print(employees)
Task 72:
   age  salary  grade
0   17      75      7
1   50      33      8
2   24      90      9
3   45     175      2
4   65     134      7
5   18      78      8
   age  salary  grade     group
0   17      75      7 -0.043871
1   50      33      8 -0.228043
2   24      90      9  0.084870
3   45     175      2  1.221017
4   65     134      7  1.226256
5   18      78      8 -1.456445
   age     group
0   17 -0.043871
1   50 -0.228043
2   24  0.084870
3   45  1.221017
4   65  1.226256
5   18 -1.456445
 
Task 73:

Drop first and last column
In [77]:
 
print('Task 73:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)
employees.drop(employees.columns[[0, len(employees.columns)-1]], axis=1, inplace = True)
print(employees)
Task 73:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
   salary  grade
0      75      7
1      33      8
2      90      9
3     175      2
4     134      7
5      78      8
 
Task 74:

Delete by pop function
In [78]:
 
print('Task 74:')  
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)
group = employees.pop('group')
print(employees)
print(group)
Task 74:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
   age  salary  grade
0   17      75      7
1   50      33      8
2   24      90      9
3   45     175      2
4   65     134      7
5   18      78      8
0    1
1    1
2    2
3    2
4    2
5    1
Name: group, dtype: int64
 
Task 75:

DataFrame.from_items
In [79]:
 
print('Task 75:')  
# df = pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6]), ('C', [7,8, 9])], orient='index', columns=['one', 'two', 'three'])
# print(df) # throwing error
Task 75:
 
Task 76:

Pandas to list
In [80]:
 
print('Task 76:')
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)  
employees_list1 = list(employees.columns.values) 
employees_list2 = employees.values.tolist()
#employees_list = list(employees)
print(employees_list1)
print(employees_list2)
Task 76:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
['age', 'salary', 'grade', 'group']
[[17, 75, 7, 1], [50, 33, 8, 1], [24, 90, 9, 2], [45, 175, 2, 2], [65, 134, 7, 2], [18, 78, 8, 1]]
 
Task 77:

Pandas rows to list
In [81]:
 
print('Task 77:')
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)  
employees_list2 = employees.values.tolist()
print(employees_list2)
print(type(employees_list2))
print(len(employees_list2))
Task 77:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
[[17, 75, 7, 1], [50, 33, 8, 1], [24, 90, 9, 2], [45, 175, 2, 2], [65, 134, 7, 2], [18, 78, 8, 1]]
<class 'list'>
6
 
Task 78:

XPandas rows to array
Note: as_matrix is deprecatedYZ
In [82]:
 
print('Task 78:')
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)  
employees_list2 = employees.values
print(employees_list2)
print(type(employees_list2))
print(employees_list2.shape)
Task 78:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
[[ 17  75   7   1]
 [ 50  33   8   1]
 [ 24  90   9   2]
 [ 45 175   2   2]
 [ 65 134   7   2]
 [ 18  78   8   1]]
<class 'numpy.ndarray'>
(6, 4)
 
Task 79:

Pandas rows to map
In [83]:
 
print('Task 79:')
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)  
employees_list2 = map(list, employees.values)
print(employees_list2)
print(type(employees_list2))
Task 79:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
<map object at 0x7f94aaa24d50>
<class 'map'>
 
Task 80:

Pandas rows to map
In [84]:
 
print('Task 80:')
employees = pd.DataFrame({
    "age": [17, 50, 24, 45, 65, 18], 
    "salary": [75, 33, 90, 175, 134, 78],
    "grade" : [7, 8, 9, 2, 7, 8],
    "group" : [1, 1, 2, 2, 2, 1] 
    
})  
print(employees)  
employees_list2 = list(map(list, employees.values))
print(employees_list2)
print(type(employees_list2))
Task 80:
   age  salary  grade  group
0   17      75      7      1
1   50      33      8      1
2   24      90      9      2
3   45     175      2      2
4   65     134      7      2
5   18      78      8      1
[[17, 75, 7, 1], [50, 33, 8, 1], [24, 90, 9, 2], [45, 175, 2, 2], [65, 134, 7, 2], [18, 78, 8, 1]]
<class 'list'>
 
Task 81:

Drop duplicates
In [85]:
 
print('Task 81:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)
users.drop_duplicates('id', inplace=True, keep='last')
print(users)
Task 81:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
   id      city  count
1   1  Montreal      8
3   2  Montreal      2
5   3    Ottawa      8
 
Task 82:

Selecting multiple columns
In [86]:
 
print('Task 82:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)
users1 = users[['id', 'city']]
print(users1)
Task 82:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
   id      city
0   1   Toronto
1   1  Montreal
2   2   Calgary
3   2  Montreal
4   3  Montreal
5   3    Ottawa
 
Task 83:

Selecting multiple columns
In [87]:
 
print('Task 83:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
}) 
print(users)
columns = ['id', 'count']
users1 = pd.DataFrame(users, columns=columns)
print(users1)
Task 83:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
   id  count
0   1      7
1   1      8
2   2      9
3   2      2
4   3      7
5   3      8
 
Task 84:

Row and Column Slicing
In [88]:
 
print('Task 84:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)    
users1 = users.iloc[0:2, 1:3]
print(users1)
Task 84:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
       city  count
0   Toronto      7
1  Montreal      8
 
Task 85:

Iterating rows
In [89]:
 
print('Task 85:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)    
for index, row in users.iterrows():
    print(row['city'], "==>", row['count'])
Task 85:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
Toronto ==> 7
Montreal ==> 8
Calgary ==> 9
Montreal ==> 2
Montreal ==> 7
Ottawa ==> 8
 
Task 86:

Iterating tuples
In [90]:
 
print('Task 86:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)    
for row in users.itertuples(index=True, name='Pandas'):
    print(getattr(row, 'city'))
    
for row in users.itertuples(index=True, name='Pandas'):
    print(row.count)
Task 86:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
Toronto
Montreal
Calgary
Montreal
Montreal
Ottawa
7
8
9
2
7
8
 
Task 87:

Iterating rows and columns
In [91]:
 
print('Task 87:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)    
for i, row in users.iterrows():
    for j, col in row.iteritems():    
        print(col)
Task 87:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
1
Toronto
7
1
Montreal
8
2
Calgary
9
2
Montreal
2
3
Montreal
7
3
Ottawa
8
 
Task 88:

List of Dictionary to Dataframe
In [92]:
 
print('Task 88:')  
pointlist = [
                {'points': 50, 'time': '5:00', 'year': 2010}, 
                {'points': 25, 'time': '6:00', 'month': "february"}, 
                {'points':90, 'time': '9:00', 'month': 'january'}, 
                {'points_h1':20, 'month': 'june'}
            ]
print(pointlist)
pointDf = pd.DataFrame(pointlist)
print(pointDf)

pointDf1 = pd.DataFrame.from_dict(pointlist)
print(pointDf1)
Task 88:
[{'points': 50, 'time': '5:00', 'year': 2010}, {'points': 25, 'time': '6:00', 'month': 'february'}, {'points': 90, 'time': '9:00', 'month': 'january'}, {'points_h1': 20, 'month': 'june'}]
   points  time    year     month  points_h1
0    50.0  5:00  2010.0       NaN        NaN
1    25.0  6:00     NaN  february        NaN
2    90.0  9:00     NaN   january        NaN
3     NaN   NaN     NaN      june       20.0
   points  time    year     month  points_h1
0    50.0  5:00  2010.0       NaN        NaN
1    25.0  6:00     NaN  february        NaN
2    90.0  9:00     NaN   january        NaN
3     NaN   NaN     NaN      june       20.0
 
Task 89:

NaN values
In [93]:
 
print('Task 89:')
df = pd.DataFrame(np.random.randn(10,6))
# Make a few areas have NaN values
df.iloc[1:3,1] = np.nan
df.iloc[5,3] = np.nan
df.iloc[7:9,5] = np.nan
print(df)
df1 = df.isnull()
print(df1)
Task 89:
          0         1         2         3         4         5
0  0.540913 -0.553456  0.453133 -1.511952  0.577551  0.055984
1  0.407596       NaN -1.208357  0.570245 -1.023995 -1.219163
2 -0.233536       NaN  0.091172 -1.342335  0.739268 -0.366752
3  0.060941  0.469757  1.298251 -1.141865 -0.815346  0.022657
4 -0.199977 -0.214600 -1.271661 -1.015698 -0.441930  0.427154
5 -0.288945 -1.905407  0.478499       NaN  1.023978  1.139541
6  0.388830 -0.223013 -0.019079 -0.260426  0.653921  0.535028
7  0.297773 -1.145989  0.864355  0.784948 -0.385051       NaN
8 -0.542621  0.708869 -0.962973 -0.652508 -1.173369       NaN
9  1.336793 -0.725214  0.826753  0.789312 -0.129610  0.523339
       0      1      2      3      4      5
0  False  False  False  False  False  False
1  False   True  False  False  False  False
2  False   True  False  False  False  False
3  False  False  False  False  False  False
4  False  False  False  False  False  False
5  False  False  False   True  False  False
6  False  False  False  False  False  False
7  False  False  False  False  False   True
8  False  False  False  False  False   True
9  False  False  False  False  False  False
 
Task 90:

Sum of all nan
In [94]:
 
print('Task 90:')  
df = pd.DataFrame(np.random.randn(10,6))
# Make a few areas have NaN values
df.iloc[1:3,1] = np.nan
df.iloc[5,3] = np.nan
df.iloc[7:9,5] = np.nan
print(df)
print(df.isnull().sum())
print(df.isnull().sum(axis=1))
print(df.isnull().sum().tolist())
Task 90:
          0         1         2         3         4         5
0  1.141499 -0.291840 -0.062988 -1.108261 -0.482589 -0.272195
1 -0.122856       NaN -0.710558  0.335315  2.170243  0.533203
2 -0.276681       NaN -0.557979 -1.362011  0.256766  0.516673
3 -0.752554  0.033607  0.796717 -0.500306 -0.757649 -1.794851
4  0.680711 -0.826902 -1.076855 -0.454343 -0.954347 -2.478088
5 -1.539196 -2.027720  1.223407       NaN -0.955970 -0.658511
6  0.116921 -0.819247  1.342155  1.520891  1.898623  1.038872
7  0.145614  0.557035  2.574063  2.220653  1.612106       NaN
8 -0.334625  0.193173 -1.215978 -0.991431  1.074652       NaN
9 -0.257326 -0.206077  0.082228 -1.216702 -0.991528 -2.072902
0    0
1    2
2    0
3    1
4    0
5    2
dtype: int64
0    0
1    1
2    1
3    0
4    0
5    1
6    0
7    1
8    1
9    0
dtype: int64
[0, 2, 0, 1, 0, 2]
 
Task 91:

Sum of all nan rowwise
In [95]:
 
print('Task 91:')  
df = pd.DataFrame(np.random.randn(10,6))
# Make a few areas have NaN values
df.iloc[1:3,1] = np.nan
df.iloc[5,3] = np.nan
df.iloc[7:9,5] = np.nan
print(df)
print(df.isnull().sum(axis=1))
Task 91:
          0         1         2         3         4         5
0  0.855893  0.001594 -0.658182  0.469672 -0.345445  0.085155
1 -0.358459       NaN -0.448836  1.241837 -1.387113 -0.755060
2 -0.022478       NaN  1.222022 -0.986843 -1.234605  1.329473
3  0.797484 -0.668570  0.362744 -0.518738 -1.082427  0.154328
4  2.405966 -1.305672 -0.216044 -0.522908 -0.934953  0.604987
5  1.431157  0.052408  0.187788       NaN  0.799391  1.499596
6  0.004238  0.872001 -0.715903 -0.665963 -1.551638  0.513720
7  0.198346  1.553118  1.323276  1.235759  1.548998       NaN
8  1.068935  1.106076 -0.013700  0.081799  0.409688       NaN
9  0.404310  0.066820  0.017141  0.358042 -0.298570  1.061888
0    0
1    1
2    1
3    0
4    0
5    1
6    0
7    1
8    1
9    0
dtype: int64
 
Task 92:

Sum of all nan as list
In [96]:
 
print('Task 92:')  
df = pd.DataFrame(np.random.randn(10,6))
# Make a few areas have NaN values
df.iloc[1:3,1] = np.nan
df.iloc[5,3] = np.nan
df.iloc[7:9,5] = np.nan
print(df)
print(df.isnull().sum().tolist())
Task 92:
          0         1         2         3         4         5
0 -1.297375  0.697703  0.421182 -1.030176 -0.013431  0.159176
1  0.968752       NaN -0.470342  0.550176 -0.588441 -0.563879
2  0.558115       NaN -1.393654 -0.213277  0.906528  0.697442
3  0.533468 -0.252609 -0.426854  1.814782 -0.086337 -0.484522
4  0.894330 -0.829854  2.034939  1.671108  1.059560  0.307216
5 -0.783491 -0.669807  0.226527       NaN -1.978117 -0.177288
6 -0.673208  0.299807 -1.546269 -0.145253  0.218424 -0.823518
7  0.613591 -1.438878 -0.821973 -1.059776 -0.399523       NaN
8  2.141051  0.428400 -0.634827 -0.445698 -2.118319       NaN
9 -0.709473 -1.331460  0.350693 -0.878138 -0.759564 -1.946125
[0, 2, 0, 1, 0, 2]
 
Task 93:

Change the order of columns Note: FutureWarning: '.reindex_axis' is deprecated and will be removed in a future version
In [97]:
 
print('Task 93:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)

# below line throws error
# users1 = users.reindex_axis(['city', 'count', 'id'], axis=1)
# print(users1)

users2 = users.reindex(columns=['city', 'id', 'count'])
print(users2)
Task 93:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
       city  id  count
0   Toronto   1      7
1  Montreal   1      8
2   Calgary   2      9
3  Montreal   2      2
4  Montreal   3      7
5    Ottawa   3      8
 
Task 94:

Drop multiple rows
In [98]:
 
print('Task 94:')
numbers = pd.DataFrame({
    "id": [1, 2, 3, 4, 5, 6], 
    "number": [10, 20, 30, 30, 23, 12]
    
})  
print(numbers)
numbers.drop(numbers.index[[0, 3, 5]], inplace=True)
print(numbers)
Task 94:
   id  number
0   1      10
1   2      20
2   3      30
3   4      30
4   5      23
5   6      12
   id  number
1   2      20
2   3      30
4   5      23
 
Task 95:

Drop multiple rows by row name
In [99]:
 
print('Task 95:')  
numbers = pd.DataFrame({
    "id": [1, 2, 3, 4, 5, 6], 
    "number": [10, 20, 30, 30, 23, 12]
    
}, index=['one', 'two', 'three', 'four', 'five', 'six'])  
print(numbers)
numbers1 = numbers.drop(['two','six'])
print(numbers1)
numbers2 = numbers.drop('two')
print(numbers2)
Task 95:
       id  number
one     1      10
two     2      20
three   3      30
four    4      30
five    5      23
six     6      12
       id  number
one     1      10
three   3      30
four    4      30
five    5      23
       id  number
one     1      10
three   3      30
four    4      30
five    5      23
six     6      12
 
Task :

Get group
In [100]:
 
print('Task 96:')
cats = animals_df.groupby(['animal']).get_group('cat')
print(cats)
Task 96:
  animal size  weight  adult
0    cat    S       8  False
2    cat    M      11  False
5    cat    L      12   True
6    cat    L      12   True
 
 
Task 97:

Get the the odd row
In [101]:
 
print('Task 97:')  
x = numpy.array([
                    [ 1,  2,  3,  4,  5],
                    [ 6,  7,  8,  9, 10],
                    [11, 12, 13, 14, 15],
                    [16, 17, 18, 19, 20]
                ]
    )
print(x)
print(x[::2])
Task 97:
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]
[[ 1  2  3  4  5]
 [11 12 13 14 15]]
 
Task 98:

Get the even columns
In [102]:
 
print('Task 98:')  
x = numpy.array([
                    [ 1,  2,  3,  4,  5],
                    [ 6,  7,  8,  9, 10],
                    [11, 12, 13, 14, 15],
                    [16, 17, 18, 19, 20]
                ]
    )
print(x)
print(x[:, 1::2])
Task 98:
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]
[[ 2  4]
 [ 7  9]
 [12 14]
 [17 19]]
 
Task 99:

Odd rows and even columns
In [103]:
 
print('Task 99:')  

x = numpy.array([
                    [ 1,  2,  3,  4,  5],
                    [ 6,  7,  8,  9, 10],
                    [11, 12, 13, 14, 15],
                    [16, 17, 18, 19, 20]
                ]
    )
print(x)
print(x[::2, 1::2])
Task 99:
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]
[[ 2  4]
 [12 14]]
 
Task 100:

Drop duplicates
In [104]:
 
print('Task 100:')  
users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)
users.drop_duplicates('id', inplace=True)
print(users)
Task 100:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8
   id      city  count
0   1   Toronto      7
2   2   Calgary      9
4   3  Montreal      7
 
Task 101:

Drop all duplicates
In [105]:
 
print('Task 101:')  
users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 
print(users)
users.drop_duplicates('name', inplace=True, keep='last')
print(users)
users1 = users.drop_duplicates('name', keep=False)
print(users1)
Task 101:
    name      city  count
0  kevin   Toronto      7
1  james  Montreal      8
2  kumar   Calgary      9
3  kevin  Montreal      2
4  kevin  Montreal      7
5  james    Ottawa      8
    name      city  count
2  kumar   Calgary      9
4  kevin  Montreal      7
5  james    Ottawa      8
    name      city  count
2  kumar   Calgary      9
4  kevin  Montreal      7
5  james    Ottawa      8
 
Task 102:

Basic group by
In [106]:
 
print('Task 102:')
animals_df1 = animals_df.groupby('animal').apply(lambda x: x['size'][x['weight'].idxmax()])
print(animals_df1)
Task 102:
animal
cat     L
dog     M
fish    M
dtype: object
 
Task 103:

Missing Data: Make A'th 3rd coulmn Nan
In [107]:
 
print('Task 103:')  
df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A'))
print(df)
df.loc[df.index[3], 'A'] = np.nan
print(df)
Task 103:
                   A
2013-08-01  0.204227
2013-08-02  0.414079
2013-08-05 -0.715691
2013-08-06  0.845370
2013-08-07  1.029887
2013-08-08 -0.057697
                   A
2013-08-01  0.204227
2013-08-02  0.414079
2013-08-05 -0.715691
2013-08-06       NaN
2013-08-07  1.029887
2013-08-08 -0.057697
 
Task 104:

reindex
In [108]:
 
print('Task 104:')
df1 = df.reindex(df.index[::-1]).ffill()
print(df1)
Task 104:
                   A
2013-08-08 -0.057697
2013-08-07  1.029887
2013-08-06  1.029887
2013-08-05 -0.715691
2013-08-02  0.414079
2013-08-01  0.204227
 
Task 105:

Column reset Nan
In [109]:
 
print('Task 105:')
animals_df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(),
                   'size': list('SSMMMLL'),
                   'weight': [8, 10, 11, 1, 20, 12, 12],
                   'adult' : [False] * 5 + [True] * 2})
print(animals_df)
Task 105:
  animal size  weight  adult
0    cat    S       8  False
1    dog    S      10  False
2    cat    M      11  False
3   fish    M       1  False
4    dog    M      20  False
5    cat    L      12   True
6    cat    L      12   True
 

Extras

 
Task 106:

Change columns
In [110]:
 
# 106. Change columns
print('Task 106:')

users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa']
}) 

print('Before changing columns : ')
print(users)

# change columns
users_new = users.rename({'name': 'first_name', 'city': 'current_city'}, axis = 1)

print('\nAfter changing columns : ')
print(users_new)
Task 106:
Before changing columns : 
    name      city
0  kevin   Toronto
1  james  Montreal
2  kumar   Calgary
3  kevin  Montreal
4  kevin  Montreal
5  james    Ottawa

After changing columns : 
  first_name current_city
0      kevin      Toronto
1      james     Montreal
2      kumar      Calgary
3      kevin     Montreal
4      kevin     Montreal
5      james       Ottawa
 
Task 107:

Match with isin function
In [111]:
 
# 107. Match with isin function

print('Task 107:')

users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa']
}) 

print('Original Dataframe:')
print(users)

print('\nFinding `Montreal` in by using isin function :')

users.isin(['Montreal']).any()
Task 107:
Original Dataframe:
    name      city
0  kevin   Toronto
1  james  Montreal
2  kumar   Calgary
3  kevin  Montreal
4  kevin  Montreal
5  james    Ottawa

Finding `Montreal` in by using isin function :
Out[111]:
name    False
city     True
dtype: bool
 
Task 108:

Finding specific items by using `isin` function
In [112]:
 
# 108. Finding specific items by using `isin` function

print('Task 108:')

users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa']
}) 

print('Original Dataframe: ')
print(users)

print('\nFinding `Montreal` in using isin and stack them: ')
print(users[users.isin(['Montreal'])].stack())
Task 108:
Original Dataframe: 
    name      city
0  kevin   Toronto
1  james  Montreal
2  kumar   Calgary
3  kevin  Montreal
4  kevin  Montreal
5  james    Ottawa

Finding `Montreal` in using isin and stack them: 
1  city    Montreal
3  city    Montreal
4  city    Montreal
dtype: object
 
Task 109:

Exclude specific matching
In [113]:
 
# 109. Exclude specific matching

print('Task 109:')

users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa']
}) 

print('Original Dataframe: ')
print(users)

print('\nExcluding `Montreal` in using isin and stack them: ')
print(users[~users.isin(['Montreal'])].stack())
Task 109:
Original Dataframe: 
    name      city
0  kevin   Toronto
1  james  Montreal
2  kumar   Calgary
3  kevin  Montreal
4  kevin  Montreal
5  james    Ottawa

Excluding `Montreal` in using isin and stack them: 
0  name      kevin
   city    Toronto
1  name      james
2  name      kumar
   city    Calgary
3  name      kevin
4  name      kevin
5  name      james
   city     Ottawa
dtype: object
 
Task 110:

Apply a custom function on multiple columns
In [114]:
 
# 110. Apply a custom function on multiple columns

print('Task 110:')

amounts = pd.DataFrame({
    "CIBC": [200, 4200, 300, 300], 
    "TD": [1200, 800, 4000, 2000]
})

print('Original Dataframe: ')
print(amounts)

def get_total_amount(x):
    
    # if the amount is less than 500, skip it
    total_amount = 0
    
    if(x['CIBC'] > 499):
        total_amount += x['CIBC']
        
    if(x['TD'] > 499):
        total_amount += x['TD']
    
    return total_amount

amounts['Total'] = amounts.apply(get_total_amount, axis = 1)

print('Dataframe after applying the custom function: ')
print(amounts)
Task 110:
Original Dataframe: 
   CIBC    TD
0   200  1200
1  4200   800
2   300  4000
3   300  2000
Dataframe after applying the custom function: 
   CIBC    TD  Total
0   200  1200   1200
1  4200   800   5000
2   300  4000   4000
3   300  2000   2000
 
Task 111:

iterrows as tuples
In [115]:
 
# 111. iterrows as tuples

print('Task 111:')

users = pd.DataFrame({
    "id": [1, 1, 2, 2, 3, 3], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 

print(users)  

print('\nIterate rows as tuples:')
for row in users.itertuples():
    print(row)
Task 111:
   id      city  count
0   1   Toronto      7
1   1  Montreal      8
2   2   Calgary      9
3   2  Montreal      2
4   3  Montreal      7
5   3    Ottawa      8

Iterate rows as tuples:
Pandas(Index=0, id=1, city='Toronto', count=7)
Pandas(Index=1, id=1, city='Montreal', count=8)
Pandas(Index=2, id=2, city='Calgary', count=9)
Pandas(Index=3, id=2, city='Montreal', count=2)
Pandas(Index=4, id=3, city='Montreal', count=7)
Pandas(Index=5, id=3, city='Ottawa', count=8)
 
Task 112:

Dataframe with NaN
In [116]:
 
# 112. Dataframe with NaN

print('Task 112:')

df = pd.DataFrame(np.nan, index = [0, 1, 2], columns = ['A', 'B', 'C'])

print(df)
Task 112:
    A   B   C
0 NaN NaN NaN
1 NaN NaN NaN
2 NaN NaN NaN
 
Task 113:

Simple Dataframe with NaN
In [117]:
 
# 113. Simple Dataframe with NaN

print('Task 113:')

df = pd.DataFrame([np.nan] * 5)

print(df)
Task 113:
    0
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
 
Task 114:

Pandas and Date with Range
In [118]:
 
# 114. Pandas and Date with Range

print('Task 114:')

import datetime as dt

pd.np.random.seed(0)

df = pd.DataFrame({
     "date" : [dt.date(2012, x, 1) for x in range(1, 11)]
})

print(df)
Task 114:
         date
0  2012-01-01
1  2012-02-01
2  2012-03-01
3  2012-04-01
4  2012-05-01
5  2012-06-01
6  2012-07-01
7  2012-08-01
8  2012-09-01
9  2012-10-01
 
Task 115:

Pandas and Numpy Repeat
In [119]:
 
# 115. Pandas and Numpy Repeat

print('Task 115:')

df = pd.DataFrame({
     "entry" : np.repeat(3, 7) # Repeat the number for 7 times
})

print(df)
Task 115:
   entry
0      3
1      3
2      3
3      3
4      3
5      3
6      3
 
Task 116:

read_sql in Pandas
In [120]:
 
# 116. read_sql in Pandas

print('Task 116:')

import sqlite3 as sql

conn = sql.connect('/kaggle/input/rj-sample-datasets/sample.db')

demo_df = pd.read_sql('SELECT ID, NAME FROM DEMO', conn)

print(demo_df)

# Note: online sqlite https://sqliteonline.com/
Task 116:
    ID                Name
0    1          SQL Online
1    2               Chart
2    3          Short CODE
3    4       SqLite 3.33.0
4    5        MultiVersion
5    6  [RightClick] mouse
6    7   Left-Panel, Table
7    8                Tabs
8    9          SQL Editor
9   10          Size table
10  11               Share
11  12              ai.Url
12  13            ai.Color
13  14            ai.Image
14  15                 SQL
15  16              CREATE
16  17              SELECT
17  18              INSERT
18  19              UPDATE
19  20              DELETE
 
Task 117:

Get a single value by iat
In [121]:
 
# 117. Get a single value by iat

print('Task 117:')

df = pd.DataFrame(np.random.randint(0, 100, size = (7, 2)), columns = list('ab'))

print('\nOriginal Dataframe:')
print(df)

val = df.iat[3, 1]

print('\nGetting value at 3rd row and first col:')
print(val)
Task 117:

Original Dataframe:
    a   b
0  44  47
1  64  67
2  67   9
3  83  21
4  36  87
5  70  88
6  88  12

Getting value at 3rd row and first col:
21
 
Task 118:

Get last n elements
In [122]:
 
# 118. Get last n elements

print('Task 118:')

df = pd.DataFrame(np.random.randint(0, 100, size = (7, 2)), columns = list('ab'))

print('\nOriginal Dataframe:')
print(df)

tail_df = df.tail(3)

print('\nLast 3 rows:')
print(tail_df)
Task 118:

Original Dataframe:
    a   b
0  58  65
1  39  87
2  46  88
3  81  37
4  25  77
5  72   9
6  20  80

Last 3 rows:
    a   b
4  25  77
5  72   9
6  20  80
 
Task 119:

Add data one by one by using loc function
In [123]:
 
# 119. Add data one by one by using loc function

df = pd.DataFrame(columns = ['rows', 'cols', 'time taken'])

df.loc['Dataset1 - Dask'] = (1000, 20, 10)
df.loc['Dataset1 - Pandas'] = (1000, 20, 15)

df
Out[123]:
rowscolstime takenDataset1 - DaskDataset1 - Pandas
1000 20 10
1000 20 15
In [124]:
 
!pip install country_converter
Collecting country_converter
  Downloading country_converter-0.7.1.tar.gz (47 kB)
     |████████████████████████████████| 47 kB 665 kB/s 
Requirement already satisfied: pandas>=0.17.0 in /opt/conda/lib/python3.7/site-packages (from country_converter) (1.1.1)
Requirement already satisfied: numpy>=1.15.4 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.17.0->country_converter) (1.18.5)
Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.17.0->country_converter) (2019.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /opt/conda/lib/python3.7/site-packages (from pandas>=0.17.0->country_converter) (2.8.1)
Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=0.17.0->country_converter) (1.14.0)
Building wheels for collected packages: country-converter
  Building wheel for country-converter (setup.py) ... - \ done
  Created wheel for country-converter: filename=country_converter-0.7.1-py3-none-any.whl size=50806 sha256=7d8182dafabadbde8e8e2f6f5c9502e65a49baa99704818939deaeead23d5224
  Stored in directory: /root/.cache/pip/wheels/38/bd/4c/973e11855305bf1c4a35f63737b4e55ab574dfb0a7ca8a0698
Successfully built country-converter
Installing collected packages: country-converter
Successfully installed country-converter-0.7.1
WARNING: You are using pip version 20.2.2; however, version 20.2.4 is available.
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.
 
Task 120:

Convert country in Dataframe
In [125]:
 
# 120. Convert country in Dataframe 

import country_converter as coco
import pandas as pd

df = pd.DataFrame({'code': ['IN', 'USA', 'BR', 'CAN']})

print('Before:')
print(df)

df['country'] = df.code.apply(lambda x: coco.convert(names = x, to = 'name_short', not_found = None))

print('\nAfter:')
print(df)
Before:
  code
0   IN
1  USA
2   BR
3  CAN

After:
  code        country
0   IN          India
1  USA  United States
2   BR         Brazil
3  CAN         Canada
 
Task 121:

Group by multiple columns
In [126]:
 
# 121. Group by multiple columns

df = pd.read_csv('../input/datasets-for-pandas/uk-500.csv')

df.groupby(['county', 'first_name']).sum().sample(5)
Out[126]:
last_namecompany_nameaddresscitypostalphone1phone2emailwebcountyfirst_nameCity of EdinburghJanetCumbriaElroyWest YorkshireCarlineNorth AyrshireAletaBerkshireSean
Steimer Alaska State Troopers 8899 Mere Lane Midlothian West Ward EH20 9SJ 01506-627051 01698-817116 janet_steimer@hotmail.com http://www.alaskastatetroopers.co.uk
Piehler Jonson, Geo D Esq 821 Pembroke Place Kirkby Stephen CA17 4RZ 01589-654152 01586-387018 epiehler@piehler.co.uk http://www.jonsongeodesq.co.uk
Bulcao Brenner, Steven G Esq 7 Hey Green Rd #4 Rothwell Ward LS26 8XE 01885-707109 01381-330004 carline@yahoo.com http://www.brennerstevengesq.co.uk
Ligons Advantage Bookkeeping 4545 Victoria Rd Kilbirnie and Beith Ward KA15 1AT 01952-535977 01715-637895 aleta_ligons@hotmail.com http://www.advantagebookkeeping.co.uk
Schurman Cooper Electric Supply Co 2877 Grain St Caversham Ward RG4 5DE 01773-139510 01423-385669 sean.schurman@schurman.co.uk http://www.cooperelectricsupplyco.co.uk
 
Task 122:

Compare two dataframes
In [127]:
 
# 122. Compare two dataframes

df1 = pd.DataFrame({
    'a' : [1,2,3], 
    'b' : [9,8,7]
})

df2 = pd.DataFrame({
    'a' : [1,1,3], 
    'b' : [12,8,12]
})

print(df1.compare(df2))
     a          b      
  self other self other
0  NaN   NaN  9.0  12.0
1  2.0   1.0  NaN   NaN
2  NaN   NaN  7.0  12.0
 
Task 123:

Difference between datetime by only seconds or dates
In [128]:
 
# 123. Difference between datetime by only seconds or dates

df1 = pd.to_datetime('2020-10-14 14:52:21')
df2 = pd.to_datetime('2020-10-18 15:58:10')

delta = (df2 - df1)

print('days   : ', delta.days)
print('seconds: ', delta.seconds)
days   :  4
seconds:  3949
 
Task 124:

Get dates and delta days in Dataframe
In [129]:
 
# 124. Get dates and delta days in Dataframe

entries_count = 6

date_series = pd.Series(pd.date_range('2020-09-01', periods = entries_count, freq = 'D'))

df = pd.DataFrame({'entry_date': date_series})

print(df)
  entry_date
0 2020-09-01
1 2020-09-02
2 2020-09-03
3 2020-09-04
4 2020-09-05
5 2020-09-06
 
Task 125:

Get dates and delta days in Dataframe
In [130]:
 
# 125. Get dates and delta days in Dataframe

entries_count = 5

date_series = pd.Series(pd.date_range('2020-09-01', periods = entries_count, freq = 'D'))

time_delta_series = pd.Series([pd.Timedelta(days = i) for i in range(entries_count)])

df = pd.DataFrame({'entry_date': date_series, 'delta_days': time_delta_series})

print(df)
  entry_date delta_days
0 2020-09-01     0 days
1 2020-09-02     1 days
2 2020-09-03     2 days
3 2020-09-04     3 days
4 2020-09-05     4 days
 
Task 126:

Date addition with time delta
In [131]:
 
# 126. Date addition with time delta

entries_count = 3

date_series = pd.Series(pd.date_range('2020-09-01', periods = entries_count, freq = 'D'))

time_delta_series = pd.Series([pd.Timedelta(days = i) for i in range(entries_count)])

df = pd.DataFrame({'entry_date': date_series, 'delta_days': time_delta_series})

print('\nBefore:')
print(df)


df['new_entry_date'] = df['entry_date'] + df['delta_days']
print('\nAfter:')
print(df)

print('\nData types:')
print(df.dtypes)
Before:
  entry_date delta_days
0 2020-09-01     0 days
1 2020-09-02     1 days
2 2020-09-03     2 days

After:
  entry_date delta_days new_entry_date
0 2020-09-01     0 days     2020-09-01
1 2020-09-02     1 days     2020-09-03
2 2020-09-03     2 days     2020-09-05

Data types:
entry_date         datetime64[ns]
delta_days        timedelta64[ns]
new_entry_date     datetime64[ns]
dtype: object
 
Task 127:

Converting Pandas innner list to numpy array without the word `list`
In [132]:
 
# 127. Converting Pandas innner list to numpy array without the word `list`

s = pd.Series([[1, 2, 3], np.array([1, 2, 3])])
    
print('Original Series:')
print(s)  

arr = np.array(s)
print('\nSeries to Array:')
print(arr)

print('\nAfter converting inner elements to list:')
x = np.array(s[:].tolist())
print(x)

# Source: https://stackoverflow.com/questions/64089218/how-to-convert-data-frame-pandas-into-list-in-numpy-without-word-list-in-array
Original Series:
0    [1, 2, 3]
1    [1, 2, 3]
dtype: object

Series to Array:
[list([1, 2, 3]) array([1, 2, 3])]

After converting inner elements to list:
[[1 2 3]
 [1 2 3]]
 
Task 128:

Append series to existing Dataframe
In [133]:
 
# 128. Append series to existing Dataframe

df = pd.DataFrame([1, 3, 4, 5], columns=["id"])
df.set_index("id")
print('Before:')
print(df)

marks = pd.Series([79, 20, 45, 99])
df["marks"] = marks
print('\nAfter:')
print(df)
Before:
   id
0   1
1   3
2   4
3   5

After:
   id  marks
0   1     79
1   3     20
2   4     45
3   5     99
 
Task 129:

Cumulative sum with group by with multiple values
In [134]:
 
# 129. Cumulative sum with group by with multiple values

df = pd.DataFrame({
    'City': ['Toronto', 'Toronto', 'Toronto', 'Montreal', 'Montreal', 'Montreal', 'New York', 'Toronto', 'New York', 'New York'], 
    'Year': [2020, 2020, 2018, 2018, 2018, 2012, 2012, 2012, 2016, 2018], 
    'Points': [1000, 2000, 3000, 3000, 2000, 1500, 1500, 500, 500, 300]
})

print('Before:')
print(df)

df.sort_values(['City', 'Year'], inplace=True)
print('\nAfter Sorting:')
print(df)

df['Points_cum'] = df.groupby(['City', 'Year'])['Points'].cumsum()
print('\nAfter Cumulative Sum:')
print(df)
Before:
       City  Year  Points
0   Toronto  2020    1000
1   Toronto  2020    2000
2   Toronto  2018    3000
3  Montreal  2018    3000
4  Montreal  2018    2000
5  Montreal  2012    1500
6  New York  2012    1500
7   Toronto  2012     500
8  New York  2016     500
9  New York  2018     300

After Sorting:
       City  Year  Points
5  Montreal  2012    1500
3  Montreal  2018    3000
4  Montreal  2018    2000
6  New York  2012    1500
8  New York  2016     500
9  New York  2018     300
7   Toronto  2012     500
2   Toronto  2018    3000
0   Toronto  2020    1000
1   Toronto  2020    2000

After Cumulative Sum:
       City  Year  Points  Points_cum
5  Montreal  2012    1500        1500
3  Montreal  2018    3000        3000
4  Montreal  2018    2000        5000
6  New York  2012    1500        1500
8  New York  2016     500         500
9  New York  2018     300         300
7   Toronto  2012     500         500
2   Toronto  2018    3000        3000
0   Toronto  2020    1000        1000
1   Toronto  2020    2000        3000
 
Task 130:

Clean URL in Pandas String (remove string after specific item)
In [135]:
 
# 130. Clean URL in Pandas String (remove string after specific item)

df = pd.DataFrame({
    'url': ['https://www.abc.com/page/2/unit234', 'https://www.abc.com/page/3/unit34', 'https://www.abc.com/page/4/unit89']
})

print('Original:')
print(df)

df['url'] = df['url'].str.replace('\/unit.*', '/', regex=True)
print('\nAfer removing unit and future chars:')
print(df)
Original:
                                  url
0  https://www.abc.com/page/2/unit234
1   https://www.abc.com/page/3/unit34
2   https://www.abc.com/page/4/unit89

Afer removing unit and future chars:
                           url
0  https://www.abc.com/page/2/
1  https://www.abc.com/page/3/
2  https://www.abc.com/page/4/
 
Task 131:

Find any matching value
In [136]:
 
# 131. Find any matching value

print('Task 131:')

users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa']
}) 

print('Original Dataframe:')
print(users)

print('\nFinding `Montreal` in any cell :')
print(users[users.eq('Montreal').any(1)])
Task 131:
Original Dataframe:
    name      city
0  kevin   Toronto
1  james  Montreal
2  kumar   Calgary
3  kevin  Montreal
4  kevin  Montreal
5  james    Ottawa

Finding `Montreal` in any cell :
    name      city
1  james  Montreal
3  kevin  Montreal
4  kevin  Montreal
In [137]:
 
# 132. Add prefix on column lables

import pandas as pd
import numpy as np

np.random.seed(34)

df = pd.DataFrame(np.random.rand(30, 5)).add_prefix("feature_")
df["year"] = np.repeat(["2012", "2013", "2014"], 10)

print(df.head())
print(len(df))
   feature_0  feature_1  feature_2  feature_3  feature_4  year
0   0.038562   0.780100   0.092704   0.632893   0.013891  2012
1   0.936159   0.426574   0.253086   0.189965   0.414825  2012
2   0.013421   0.648473   0.869369   0.609036   0.807993  2012
3   0.826511   0.271683   0.836476   0.282541   0.138309  2012
4   0.714720   0.472537   0.556588   0.385576   0.941147  2012
30
In [138]:
 
# 133. Set values on where condition

df = pd.DataFrame({'city': ['Toronto', 'Toronto', 'Montreal', 'Montreal', 'Montreal'], 'Year': [2017, 2018, 2019, 2020, 2021]})

print('Original;')
print(df)

df['airport'] = None
df['airport'] = np.where(df['city'] == 'Toronto', {'airport_count': 2}, df['airport'])

print('\nAfter:')
print(df)
Original;
       city  Year
0   Toronto  2017
1   Toronto  2018
2  Montreal  2019
3  Montreal  2020
4  Montreal  2021

After:
       city  Year               airport
0   Toronto  2017  {'airport_count': 2}
1   Toronto  2018  {'airport_count': 2}
2  Montreal  2019                  None
3  Montreal  2020                  None
4  Montreal  2021                  None
In [139]:
 
# 134. Flatten Dictionary to Dataframe

d = {
    "country" : {
        "canada": {
            "code": 'CA',
            "city": {
                "name": "Toronto",
                "population": "6M"
            }
        }
    }
}

df = pd.json_normalize(d)

print(df)
  country.canada.code country.canada.city.name country.canada.city.population
0                  CA                  Toronto                             6M
In [140]:
 
# 135. Using transform

df = pd.DataFrame({
        'ID': [1, 2, 2, 3, 3, 4, 4],
        'Value': [10, 5, 5, 10, 4, 11, 12]
    })

df['Count'] = df.groupby('ID')['Value'].transform('count')
print(df)
   ID  Value  Count
0   1     10      1
1   2      5      2
2   2      5      2
3   3     10      2
4   3      4      2
5   4     11      2
6   4     12      2
In [141]:
 
# 136. Summing all elements

df = pd.DataFrame({
        'ID': [1, 2, 2, 3, 3, 4, 4],
        'Value': [10, 5, 5, 10, 4, 11, 12]
    })

sum1 = df.stack().sum()
print(sum1)

sum2 = df.to_numpy().sum()
print(sum2)
76
76
In [142]:
 
# 137. Combine multiple CSV files by using Dask

import dask.dataframe as dd

df = dd.read_csv('/kaggle/input/rj-sample-datasets/transaction-*.csv')

df = df.compute() 

df.columns = ['Date', 'Description', 'Spent']

df.head(2)
Out[142]:
DateDescriptionSpent01
08/04/2020 GOOGLE *GSUITE_ _V 7.8
08/04/2020 GOOGLE *GSUITE_ _V 15.6
In [143]:
 
# 138. Separate Elements into Bins

df = pd.DataFrame({'number': np.random.randint(1, 100, 5)}) 
print('Original:')
print(df)

df['bins'] = pd.cut(x = df['number'], bins = [1, 20, 40, 60, 80, 100]) 
print('\nAfter pandas cut:')
print(df)
Original:
   number
0      80
1      81
2      38
3      83
4      47

After pandas cut:
   number       bins
0      80   (60, 80]
1      81  (80, 100]
2      38   (20, 40]
3      83  (80, 100]
4      47   (40, 60]
In [144]:
 
# 139. Encode the object as enumerated or categorical

codes, uniques = pd.factorize(['TO', 'NY', 'CA', 'MNT', 'CA', 'NY'], sort = True)

print(codes)
print(uniques)
[3 2 0 1 0 2]
['CA' 'MNT' 'NY' 'TO']
In [145]:
 
# 140. 


df = pd.read_csv("/kaggle/input/rj-sample-datasets/sample-transaction.csv", names = ['date', 'description', 'spent'])

# df

bins = np.linspace(min(df['spent']), max(df['spent']), 4)
# print(bins)
groupNames = ["low", "med", "high"]

df['spentGroup'] = pd.cut(df['spent'], bins, labels = groupNames, include_lowest = True)
print(df)
           date           description   spent spentGroup
0    08/01/2019  SSV TO:  11686447909    6.00        low
1    08/01/2019  GOOGLE *GSUITE_   _V   21.06        low
2    08/01/2019  PYT TO:  19764085822   50.00        low
3    08/01/2019  PYT TO:  11683291109   53.72        low
4    08/01/2019  SEND E-TFR CA***YGw   700.00        low
..          ...                   ...     ...        ...
110  08/30/2019  AROMA ESPRESSO    _F    2.68        low
111  08/30/2019  BAKERY ON THE G   _F   18.00        low
112  08/30/2019  SSV TO:  11686447909    6.00        low
113  08/30/2019  TD MUTUAL FUNDS       125.00        low
114  08/30/2019  MONTHLY ACCOUNT FEE    15.95        low

[115 rows x 4 columns]
In [146]:
 
# 141. Simple result table with Pandas

results = pd.DataFrame(columns = ['Accuracy'])

results.loc['Random Classifier'] = 98.893
results.loc['KNN'] = 99.343

results
Out[146]:
AccuracyRandom ClassifierKNN
98.893
99.343
In [147]:
 
# 142. isin with multiple conditions

df = pd.DataFrame([
        ['LEhOc7XSE0','2020', '03', 'car'],
        ['LEhOc7XSE0','2020', '03', 'truck'],
        ['LEhOc7XSE0','2020', '03', 'bike'],
        ['LEhOc7XSE0','2020', '03', 'insurance'],
        ['LEhOc7XSE0','2020', '03', 'inspection'],
        ['iXC5AfJMox','2020', '04', 'car'],
        ['iXC5AfJMox','2020', '04', 'truck'],
        ['iXC5AfJMox','2020', '04', 'inspection'],
        ['XpLLAySojz','2020', '01', 'bike'],
    ], columns=['order_id','year', 'month', 'item_type'])
print('Original:')
print(df)

df['bike_or_car'] = df['item_type'].isin({'bike', 'car'})
print('\nAfter:')
print(df)

# source: https://stackoverflow.com/questions/64248872/python-pandas-count-columns-with-multiple-conditions-and-groupby
Original:
     order_id  year month   item_type
0  LEhOc7XSE0  2020    03         car
1  LEhOc7XSE0  2020    03       truck
2  LEhOc7XSE0  2020    03        bike
3  LEhOc7XSE0  2020    03   insurance
4  LEhOc7XSE0  2020    03  inspection
5  iXC5AfJMox  2020    04         car
6  iXC5AfJMox  2020    04       truck
7  iXC5AfJMox  2020    04  inspection
8  XpLLAySojz  2020    01        bike

After:
     order_id  year month   item_type  bike_or_car
0  LEhOc7XSE0  2020    03         car         True
1  LEhOc7XSE0  2020    03       truck        False
2  LEhOc7XSE0  2020    03        bike         True
3  LEhOc7XSE0  2020    03   insurance        False
4  LEhOc7XSE0  2020    03  inspection        False
5  iXC5AfJMox  2020    04         car         True
6  iXC5AfJMox  2020    04       truck        False
7  iXC5AfJMox  2020    04  inspection        False
8  XpLLAySojz  2020    01        bike         True
In [148]:
 
# 143. String to Pandas with python engine and separator

from io import StringIO
import pandas as pd

data = '''            student       science   maths  history
0     David       90          100  89
20    Kevin       98          97  90
'''
df = pd.read_csv(StringIO(data), sep='\s\s+', engine='python')

print(df)

# source: https://stackoverflow.com/questions/64268344/pandas-consolidate-two-rows-in-one
   student  science  maths  history
0    David       90    100       89
20   Kevin       98     97       90
In [149]:
 
# 144.

def get_unique_items(df):
    
    # how many cols are there
    cols = df.columns
    cols_count = len(df.columns)
    # print(cols_count)

    unique_max_count = 0
    for _col in cols:
#         print(_col)
        current_col_unique_values = list(df[_col].unique())
        list_len = len(current_col_unique_values) 
        
        if(list_len > unique_max_count):
            unique_max_count = list_len
    
    # list of each_cols and unique values
    col_unique_values = {}
    
    for _col in cols:
#         print(_col)
        current_col_unique_values = list(df[_col].unique())
        list_len = len(current_col_unique_values) 
        
        fill_count = 0
        if(list_len < unique_max_count):
            fill_count = unique_max_count - list_len
            
            for _ in range(fill_count):
                current_col_unique_values.append('-')
        
        col_unique_values[_col] = current_col_unique_values
        
    df_new = pd.DataFrame(col_unique_values)
    
    return df_new

users = pd.DataFrame({
    "name": ['kevin', 'james', 'kumar', 'kevin', 'kevin', 'james'], 
    "city": ['Toronto', 'Montreal', 'Calgary', 'Montreal', 'Montreal', 'Ottawa'],
    "count" : [7, 8, 9, 2, 7, 8] 
    
}) 

unique_user_items = get_unique_items(users)

unique_user_items
Out[149]:
namecitycount0123
kevin Toronto 7
james Montreal 8
kumar Calgary 9
- Ottawa 2
linkcode

To do:

  • Clean up some buggy exercises
  • Remove duplicate exercises
  • Add more documentation
 
 
반응형

댓글