Download Location: https://www.kaggle.com/bappekim/air-pollution-in-seoul
import numpy as np
import pandas as pd
from scipy import stats
This function reformats the CSV data by splitting the Measurement date column into two columns (Measurement_Date and Measurement_Time) and the Address column into four columns (Country, City, District, and Street_Address).
def reformat_CSV_Data(airData):
    # Splitting the Measurement date column into two columns:
    # one with just the date and the other with just the time
    date = []
    time = []
    for line in airData['Measurement date']:
        row = []
        for val in line.split(' '):
            row += [val.strip()]
        date += [row[0]]
        time += [row[1]]
    airData['Measurement_Date'] = date
    airData['Measurement_Time'] = time

    # Splitting the Address column into four columns:
    # Country, City, District, and Street_Address
    country = []
    city = []
    district = []
    streetAddress = []
    for line in airData['Address']:
        row = []
        for val in line.split(','):
            row += [val.strip()]
        country += [row[4]]
        city += [row[3]]
        district += [row[2]]
        streetAddress += [row[0] + ' ' + row[1]]
    airData['Country'] = country
    airData['City'] = city
    airData['District'] = district
    airData['Street_Address'] = streetAddress

    # Dropping the Measurement date and Address columns from the dataframe
    airData.drop(['Address', 'Measurement date'], axis=1, inplace=True)

    # Rename the Station code column (to Station_Code) for easier parsing later on
    airData = airData.rename(columns={'Station code': 'Station_Code'})

    # Rearranging the column order for easier reading
    airData = airData[['Measurement_Date', 'Measurement_Time', 'Station_Code',
                       'Country', 'City', 'District', 'Street_Address', 'Latitude',
                       'Longitude', 'SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']]
    return airData
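As a quick illustration of the splitting logic above, the snippet below runs the same string operations on two sample values. The sample date and address strings are made-up examples in the dataset's format, not rows copied from the CSV.

sample_date = '2017-01-01 00:00'
print(sample_date.split(' '))    # ['2017-01-01', '00:00'] -> Measurement_Date, Measurement_Time

sample_addr = '119, Jong-ro, Jongno-gu, Seoul, Republic of Korea'   # assumed example string
parts = [val.strip() for val in sample_addr.split(',')]
# parts[4] = Country, parts[3] = City, parts[2] = District, parts[0] + ' ' + parts[1] = Street_Address
print(parts[4], '|', parts[3], '|', parts[2], '|', parts[0] + ' ' + parts[1])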
This function applies a filter to the reformatted data to remove outliers that might skew the analysis. A record is kept only if every value in the numeric data columns (SO2, NO2, O3, CO, PM10, and PM2.5) falls within ±3 standard deviations of that column's mean; records with negative readings are also dropped, since pollutant concentrations cannot be below zero.
def filterData(data):
    # The columns in the dataframe that hold the numeric pollutant measurements
    dataCols = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']

    # This applies a filter to all the data columns of the dataframe:
    # * For each column, it first computes the Z-score of each value
    #   in the column relative to the column mean and standard deviation.
    # * If the score is not within -3 and +3 standard deviations away from the mean for that
    #   column, then the record is filtered out of the dataframe (thus removing the outliers)
    filteredData = data[(np.abs(stats.zscore(data[dataCols])) < 3).all(axis=1)]

    # This filter removes any data that is less than zero because
    # the measurement of pollutants in the air cannot go below zero
    filteredData = filteredData[(filteredData[dataCols] >= 0).all(axis=1)]

    print('Total number of rows BEFORE data is removed: {:,}\n Total number of rows AFTER data is removed: {:,}\n'
          '====================================================\n\t Total number of rows removed: {:>7,}'
          .format(len(data.index), len(filteredData.index), len(data.index) - len(filteredData.index)))
    return filteredData
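The toy example below shows how the z-score mask behaves. The numbers are made up purely for demonstration (they are not drawn from the Seoul dataset): eleven typical PM2.5 readings plus one extreme reading, alongside unremarkable PM10 values.

# Illustrative only: 12 fake rows, one of which is an extreme PM2.5 outlier
toy = pd.DataFrame({'PM2.5': [20.0] * 11 + [400.0],
                    'PM10':  [30.0, 40.0] * 6})
mask = (np.abs(stats.zscore(toy)) < 3).all(axis=1)
print(len(toy), '->', int(mask.sum()))   # 12 -> 11: only the extreme PM2.5 reading is dropped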
This function calculates the AQI (Air Quality Index) value from the PM2.5 concentration and determines its risk level. The AQI formula, breakpoint ranges, and risk levels are all taken from the US EPA (Environmental Protection Agency).
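To make the linear-interpolation formula concrete before the function, here is one worked example. The PM2.5 value of 40.0 µg/m³ is an assumed illustration, not a specific row from the dataset: it falls in the 35.5–55.4 breakpoint range, which maps onto the 101–150 AQI range.

c, c_low, c_high = 40.0, 35.5, 55.4   # assumed example concentration and its EPA breakpoint range
i_low, i_high = 101, 150
aqi = (i_high - i_low) / (c_high - c_low) * (c - c_low) + i_low
print(round(aqi))   # 112 -> 'Unhealthy for Sensitive Groups'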
def calculate_AQI(airData):
    airData = airData.reset_index()
    aqiValues = []
    riskLevel = []
    for i in range(len(airData)):
        i_low = 0
        i_high = 0
        c_low = 0
        c_high = 0

        # PM2.5 AQI Value Calculation
        if 0 <= airData['PM2.5'][i] <= 12:
            c_low = 0
            c_high = 12
            i_low = 0
            i_high = 50
        elif 12.1 <= airData['PM2.5'][i] <= 35.4:
            c_low = 12.1
            c_high = 35.4
            i_low = 51
            i_high = 100
        elif 35.5 <= airData['PM2.5'][i] <= 55.4:
            c_low = 35.5
            c_high = 55.4
            i_low = 101
            i_high = 150
        elif 55.5 <= airData['PM2.5'][i] <= 150.4:
            c_low = 55.5
            c_high = 150.4
            i_low = 151
            i_high = 200
        elif 150.5 <= airData['PM2.5'][i] <= 250.4:
            c_low = 150.5
            c_high = 250.4
            i_low = 201
            i_high = 300
        elif 250.5 <= airData['PM2.5'][i] <= 350.4:
            c_low = 250.5
            c_high = 350.4
            i_low = 301
            i_high = 400
        elif 350.5 <= airData['PM2.5'][i] <= 500.4:
            c_low = 350.5
            c_high = 500.4
            i_low = 401
            i_high = 500

        # AQI Formula
        aqiValues += [int(round(((i_high - i_low) / (c_high - c_low)) *
                                (airData['PM2.5'][i] - c_low) + i_low, 0))]

        # Determine AQI Risk Level
        if 0 <= aqiValues[i] <= 50:
            riskLevel += ['Good']
        elif 51 <= aqiValues[i] <= 100:
            riskLevel += ['Moderate']
        elif 101 <= aqiValues[i] <= 150:
            riskLevel += ['Unhealthy for Sensitive Groups']
        elif 151 <= aqiValues[i] <= 200:
            riskLevel += ['Unhealthy']
        elif 201 <= aqiValues[i] <= 300:
            riskLevel += ['Very Unhealthy']
        elif 301 <= aqiValues[i] <= 500:
            riskLevel += ['Hazardous']

    # Add the AQI values to the data frame
    airData['AQI_(PM2.5)'] = aqiValues
    airData['AQI_Risk_Level'] = riskLevel

    # Remove index column that was added after the dataframe was reindexed
    airData.drop(['index'], axis=1, inplace=True)
    return airData
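For reference, the same PM2.5-to-AQI mapping can also be written without a per-row loop. The sketch below is an alternative, not the method used above (the function name calculate_AQI_vectorized is just an illustrative label): np.interp linearly interpolates between the same EPA breakpoints, and pd.cut attaches the matching risk labels. Inside each breakpoint band it produces the same rounded values as the loop; readings that fall in the small gaps between bands (e.g. between 12.0 and 12.1) or above 500.4 are interpolated/clipped instead of being left unhandled.

# Assumed sketch of a vectorized alternative, built from the same EPA breakpoints as the loop above
PM25_BREAKPOINTS = [0.0, 12.0, 12.1, 35.4, 35.5, 55.4, 55.5, 150.4,
                    150.5, 250.4, 250.5, 350.4, 350.5, 500.4]
AQI_BREAKPOINTS = [0, 50, 51, 100, 101, 150, 151, 200, 201, 300, 301, 400, 401, 500]
RISK_LABELS = ['Good', 'Moderate', 'Unhealthy for Sensitive Groups',
               'Unhealthy', 'Very Unhealthy', 'Hazardous']

def calculate_AQI_vectorized(airData):
    airData = airData.reset_index(drop=True)
    # Piecewise-linear interpolation from PM2.5 concentration to AQI value
    aqi = np.interp(airData['PM2.5'], PM25_BREAKPOINTS, AQI_BREAKPOINTS)
    airData['AQI_(PM2.5)'] = np.rint(aqi).astype(int)
    # Bin the integer AQI values into the EPA risk categories
    airData['AQI_Risk_Level'] = pd.cut(airData['AQI_(PM2.5)'],
                                       bins=[0, 50, 100, 150, 200, 300, 500],
                                       labels=RISK_LABELS, include_lowest=True)
    return airData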
# Read csv Air pollution data
airData = pd.read_csv('Air_Data/Measurement_summary.csv')
print('Initial size of data: {:,} columns and {:,} rows'.format(airData.shape[1], airData.shape[0]))
print('Old CSV Columns: ' + ', '.join(list(airData.keys())))
airData.head()
# Reformat data for easier data parsing
reformed_airData = reformat_CSV_Data(airData)
print('New CSV Columns: ' + ', '.join(list(reformed_airData.keys())))
reformed_airData.head()
# Remove outliers and invalid values from the data
filteredData = filterData(reformed_airData)
airData = calculate_AQI(filteredData)
# Two new columns added: AQI_(PM2.5) and AQI_Risk_Level
print('New columns: ' + ', '.join(list(airData.keys())))
airData.head()
print('Final size of data: {:,} columns and {:,} rows'.format(airData.shape[1], airData.shape[0]))
airData.to_csv('Reformed_Data/CSV-Air_Pollution_Data-(Reformed_and_AQI_Values).csv', index=None)