[Data Analysis] Topic 2: Pandas

Authored by Tony Feng

Created on April 30th, 2022

Last Modified on May 1st, 2022

Intro

Pandas is a great data-processing library for table/database-like data, and it is excellent for data that arrives as, or that you want to write out to, CSV/Excel. DataFrames make manipulating your data easy, from selecting or replacing columns and indices to reshaping your data.


Data Preprocessing

DataFrame Creation

import pandas as pd

data = {
    "Country": ['China', 'US', 'India'],
    "Population": [14, 3, 14]
}
df = pd.DataFrame(data)

Data Reading

import pandas as pd
df = pd.read_csv('data.csv')
df.head()       # First 5 rows
df.tail()       # Last 5 rows
df.info()       # Overview of the data
df.describe()   # Statistical details
df.columns      # Column names
df.dtypes       # Type of each column
df.values       # Data matrix
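
read_csv accepts many options for partial or customized loading; a small sketch (the column names here are hypothetical):

df = pd.read_csv('data.csv',
                 sep=',',                  # Field delimiter
                 usecols=['Name', 'Age'],  # Load only these columns
                 nrows=1000)               # Read only the first 1000 rows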

Data Indexing

df_age = df['Age']      # Get a column as a Series
age = df_age.values     # Get an ndarray
df_birthday = df[['Year', 'Month', 'Day']]      # Get multiple columns

df.iloc[0]          # First piece of data
df.iloc[0:5]        # First 5 rows
df.iloc[0:5, 1:3]   # First 5 rows with specified features

df = df.set_index('Name')       # Set 'Name' as the index
df.loc['Tony']                  # Use the name to find a row
df.loc['Tony', 'Age']           # Use the name to find the age

df[df['Age'] >= 18][:5]                      # First 5 rows where age is at least 18
df.loc[df['Sex']=='Female', 'Age'].mean()    # Calculate the average age of all females
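
Boolean conditions can be combined with & and |; each condition needs its own parentheses because of operator precedence. A small sketch on the same hypothetical columns:

df[(df['Age'] >= 18) & (df['Sex'] == 'Female')]     # Adult females
df[(df['Age'] < 18) | (df['Age'] > 65)]             # Minors or seniors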

Series

# Create
val = [1, 10, 100]
idx = ['a', 'b', 'c']
s = pd.Series(data=val, index=idx)

# Indexing
s.loc['b']      # 10
s.iloc[1]       # 10

# Assignment
s1 = s.copy()
s1['a'] = 0
s1.rename(index={'a':'alpha'}, inplace=True)
s1.index                # Index(['alpha', 'b', 'c'], dtype='object')

# Add
s2 = pd.Series(data=[1000, 10000], index=['e', 'f'])
s3 = pd.concat([s1, s2])        # Series.append is deprecated; use pd.concat
s3['d'] = 500           # Add an index with a value

# Remove
del s3['alpha']
s3.drop(['b', 'c'], inplace=True)
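
One Series behavior worth knowing: arithmetic aligns values by index label, not by position, and labels present on only one side produce NaN. A minimal sketch:

sa = pd.Series([1, 2], index=['a', 'b'])
sb = pd.Series([10, 20], index=['b', 'c'])
sa + sb
'''
a     NaN
b    12.0
c     NaN
dtype: float64
'''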

Data Analysis

Statistics

df = pd.DataFrame([[1,2,3],[4,5,6]], index=['a','b'], columns=['A','B','C'])
df.sum()        # axis=0 by default
df.mean()
df.median()
df.max()
df.min()

df.describe()   # Statistical details: count, mean, std, min, quartiles, max
df.cov()        # Covariance
df.corr()       # Correlation coefficient

# Assume df now holds a passenger-style dataset with 'Sex' and 'Age' columns
df['Sex'].value_counts(ascending=True)          # e.g. Female 314, Male 577
df['Age'].value_counts(ascending=True, bins=5)  # Age values are split into 5 equal-width ranges

ages = [15, 18, 20, 21, 26, 33, 45, 66, 75]
bins = [10, 30, 50, 80]                 # (10, 30], (30, 50], (50, 80] Three intervals
names = ["Y", "M", "O"]
res = pd.cut(ages, bins, labels=names)  # ['Y', 'Y', 'Y', 'Y', 'Y', 'M', 'M', 'O', 'O']
                                        # Categories (3, object): ['Y' < 'M' < 'O']
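
To count how many values fall into each interval, value_counts can be applied to the result (a small sketch building on the example above):

pd.Series(res).value_counts(sort=False)
'''
Y    5
M    2
O    2
dtype: int64
'''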

Pivot Table

# Pivot
df = pd.DataFrame({
    "Type": ['Breakfast', 'Lunch', 'Dinner', 'Breakfast', 'Lunch', 'Dinner', 'Breakfast', 'Lunch', 'Dinner'],
    "Month": [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "Cost": [50, 60, 100, 30, 70, 90, 40, 40, 40]
})
pivot = df.pivot(index="Month", columns="Type", values="Cost")
'''
Type   Breakfast  Dinner  Lunch
Month                          
1             50     100     60
2             30      90     70
3             40      40     40
'''
pivot.sum(axis=0) 
'''
Type
Breakfast    120
Dinner       230
Lunch        170
dtype: int64
'''

# Pivot Table
df = pd.DataFrame({
    "A": ["foo", "foo", "foo", "foo", "foo","bar", "bar", "bar", "bar"],
    "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
    "C": ["small", "large", "large", "small", "small", "large", "small", "small", "large"],
    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]
})
pTable = df.pivot_table(index=['A', 'B'], columns=['C'], values='D', aggfunc='sum', fill_value=-1)
'''
C        large  small
A   B                
bar one      4      5
    two      7      6
foo one      2      1
    two     -1      3
'''
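
pivot_table can also append row and column totals with margins=True; a sketch on the same df:

df.pivot_table(index='A', columns='C', values='D', aggfunc='sum', margins=True)
'''
C    large  small  All
A                     
bar     11     11   22
foo      4      7   11
All     15     18   33
'''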

Group

df = pd.DataFrame({
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas","Hadoop","Spark","Python","R"],
    'Fee' :[22000,25000,23000,24000,26000,25000,25000,22000,1500],
    'Duration':[30,50,55,40,60,35,30,50,40],
    'Discount':[1000,2300,1000,1200,2500,None,1400,1600,0]
})
# Group by multiple columns
df2 = df.groupby(['Courses', 'Duration']).sum()
'''
                    Fee  Discount
Courses Duration                 
Hadoop  35        25000       0.0
        55        23000    1000.0
Pandas  60        26000    2500.0
PySpark 50        25000    2300.0
Python  40        24000    1200.0
        50        22000    1600.0
R       40         1500       0.0
Spark   30        47000    2400.0
'''
# Group keys are sorted ascending by default (sort=True); there is no descending option here
df.groupby('Courses', sort=True).sum()
'''
           Fee  Duration  Discount
Courses                           
Hadoop   48000        90    1000.0
Pandas   26000        60    2500.0
PySpark  25000        50    2300.0
Python   46000        90    2800.0
R         1500        40       0.0
Spark    47000        60    2400.0
'''
# Sort group keys in descending order
groupedDF = df.groupby('Courses',sort=False).sum()
groupedDF.sort_values('Courses', ascending=False)
'''
Spark    47000        60    2400.0
R         1500        40       0.0
Python   46000        90    2800.0
PySpark  25000        50    2300.0
Pandas   26000        60    2500.0
Hadoop   48000        90    1000.0
'''
# Keep NaN group keys (dropna=False) and turn the group key back into a column
df3 = df.groupby(['Courses'], dropna=False).sum().reset_index()
'''
   Courses    Fee  Duration  Discount
0   Hadoop  48000        90    1000.0
1   Pandas  26000        60    2500.0
2  PySpark  25000        50    2300.0
3   Python  46000        90    2800.0
4        R   1500        40       0.0
5    Spark  47000        60    2400.0
'''
# Apply aggregations
df.groupby('Courses').aggregate({'Duration':'count','Fee':['min','max']})
'''
        Duration    Fee       
           count    min    max
Courses                       
Hadoop         2  23000  25000
Pandas         1  26000  26000
PySpark        1  25000  25000
Python         2  22000  24000
R              1   1500   1500
Spark          2  22000  25000
'''
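
Named aggregation (pandas >= 0.25) is a variant that produces flat column names instead of a MultiIndex, which is often easier to work with downstream:

df.groupby('Courses').agg(min_fee=('Fee', 'min'), max_fee=('Fee', 'max'), n=('Duration', 'count'))
'''
         min_fee  max_fee  n
Courses                     
Hadoop     23000    25000  2
Pandas     26000    26000  1
PySpark    25000    25000  1
Python     22000    24000  2
R           1500     1500  1
Spark      22000    25000  2
'''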

Useful Functions

Merge

df1 = pd.DataFrame({
    'name': ['Foreign Cinema', 'Liho Liho', '500 Club', 'The Square', 'S86'],
    'Type': ['Restaurant', 'Restaurant', 'bar', 'bar', 'bar']
})

df2 = pd.DataFrame({
    'name': ['Foreign Cinema', 'Liho Liho', '500 Club', 'The Square'],
    'AvgBill': [100, 200, 300, 400],
    'Rating': [5, 4.2, 4.3, 4.8]
})

pd.merge(
    df1, df2,
    on = 'name',
    how = 'outer',      # Otherwise, 'S86' would be dropped.
    indicator = True    # Adds a '_merge' column showing each row's source
)

'''
             name        Type  AvgBill  Rating     _merge
0  Foreign Cinema  Restaurant    100.0     5.0       both
1       Liho Liho  Restaurant    200.0     4.2       both
2        500 Club         bar    300.0     4.3       both
3      The Square         bar    400.0     4.8       both
4             S86         bar      NaN     NaN  left_only
'''
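
If the key columns are named differently in the two frames, left_on/right_on replaces on. A hypothetical variant where df2's key is called 'restaurant':

df2r = df2.rename(columns={'name': 'restaurant'})
pd.merge(df1, df2r, left_on='name', right_on='restaurant', how='outer')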

Sort

df = pd.DataFrame({
    'group': ['a']*3 + ['b']*3 + ['c']*3,
    'data': [4,3,2,1,12,3,4,5,7]
})

df.sort_values(by=['data', 'group'], ascending=[False, True], inplace=True)
'''
  group  data
4     b    12
8     c     7
7     c     5
0     a     4
6     c     4
1     a     3
5     b     3
2     a     2
3     b     1
'''
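
When only the top rows matter, nlargest avoids a full sort (a sketch on the same df):

df.nlargest(3, 'data')
'''
  group  data
4     b    12
8     c     7
7     c     5
'''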

Data Cleaning

# Remove duplicates
df.drop_duplicates()                        # Drop rows that are exact duplicates
df.drop_duplicates(subset='some_feature')   # Keep one row per 'some_feature' value

# NaN
import numpy as np
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
df.isnull()
df.isnull().any(axis=1) # Check whether rows have NaN
df.fillna(-1)           # Fill NaN
'''
      df         |         df.isnull()         | .any(axis=1)      |     df.fillna(-1)
   0    1    2   |          0      1      2    |    0    False     |         0    1    2
0  0  1.0  2.0   |   0  False  False  False    |    1     True     |      0  0  1.0  2.0
1  0  NaN  0.0   |   1  False   True  False    |    2     True     |      1  0 -1.0  0.0
2  0  0.0  NaN   |   2  False  False   True    |    3    False     |      2  0  0.0 -1.0
3  0  1.0  2.0   |   3  False  False  False    |    dtype: bool    |      3  0  1.0  2.0
'''

# apply
df['new_col'] = df.apply(some_func, axis='columns')  # some_func can be a named function or an inline lambda
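
A concrete inline version, using the NaN frame above (the new column name is arbitrary):

df['nan_count'] = df.apply(lambda row: row.isnull().sum(), axis='columns')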

Timestamp

# Create a timestamp
ts = pd.Timestamp('2022-05-01') # Timestamp('2022-05-01 00:00:00')
ts.month    # 5
ts.day      # 1
ts + pd.Timedelta('5 days')     # Timestamp('2022-05-06 00:00:00')
pd.Series(pd.date_range(start='2022-05-01', periods=5, freq='12H'))
'''
0   2022-05-01 00:00:00
1   2022-05-01 12:00:00
2   2022-05-02 00:00:00
3   2022-05-02 12:00:00
4   2022-05-03 00:00:00
dtype: datetime64[ns]
'''
# Read a CSV with timestamps as the index
df = pd.read_csv('./data.csv', index_col=0, parse_dates=True)
df.loc['2022']                  # Partial string indexing: all rows in 2022
df.loc['2022-01':'2022-02']     # All rows from January through February 2022
df.resample('2D').mean().head()     # Two days as a period
df.resample('M').max().head()       # One month as a period
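
If timestamps arrive as plain strings in a column rather than the index, pd.to_datetime converts them explicitly (a sketch; the 'date' column is hypothetical):

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')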

Big Data Techniques

Datatype Conversion

# Shrink memory usage by downcasting float64 columns to smaller float subtypes
df_float = df.select_dtypes(include=['float64'])
converted_float = df_float.apply(pd.to_numeric, downcast='float')
print(mem_usage(df_float))
print(mem_usage(converted_float))
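
mem_usage above is not a pandas builtin but a small user-defined helper; a minimal sketch:

def mem_usage(obj):
    # deep=True counts the real memory of object (string) columns
    if isinstance(obj, pd.DataFrame):
        usage_b = obj.memory_usage(deep=True).sum()
    else:   # Series
        usage_b = obj.memory_usage(deep=True)
    return "{:.2f} MB".format(usage_b / 1024 ** 2)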

Categorical Conversion

# Assume the data has a 'weekday' column stored as object (strings).
# Each row holds its own string object, so 70 rows mean 70 separate values in memory;
# converting to 'category' stores only the 7 distinct values plus an integer code per row.
df_obj = df.select_dtypes(include=['object'])
dow = df_obj.weekday
dow_cat = dow.astype('category')
dow_cat.cat.codes       # Integer codes; only 7 distinct values appear
print(mem_usage(dow))
print(mem_usage(dow_cat))
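
A common heuristic is to convert an object column to category only when it has few distinct values relative to its length; a sketch (the 0.5 threshold is an arbitrary choice):

converted_obj = pd.DataFrame()
for col in df_obj.columns:
    if df_obj[col].nunique() / len(df_obj[col]) < 0.5:
        converted_obj[col] = df_obj[col].astype('category')
    else:
        converted_obj[col] = df_obj[col]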
