Edwin Ruiz
Hello! Glad to see you here. This is a collection of my personal projects made during free time.
In [1]:
import requests
from bs4 import BeautifulSoup
In [2]:
#
url = "https://www.worldometers.info/coronavirus/"
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
In [4]:
table=soup.find('table',attrs={"id":"main_table_countries_today"})
headers=table.find_all('th')
columns=[]
for header in headers:
columns.append(header.get_text().strip().replace('\n','').replace('\xa0',' ').replace(' ',''))
In [16]:
rows=soup.find_all('tr')
data=[]
for x in rows:
data.append([dato.get_text(strip=True).replace('\n','').replace('\xa0',' ').replace(' ','').replace(',','') for dato in x.find_all('td')])
In [23]:
import pandas as pd
import numpy as np
# Ignoring the first rows and last 8
df=pd.DataFrame(data=data[9:-8:],columns=columns)
In [24]:
df.head().append(df.tail(5))
Out[24]:
| # | Country,Other | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | ... | TotalTests | Tests/1Mpop | Population | Continent | 1CaseeveryXppl | 1DeatheveryXppl | 1TesteveryXppl | NewCases/1Mpop | NewDeaths/1Mpop | ActiveCases/1Mpop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | USA | 108862996 | 1178436 | 106686109 | 998451 | 1605 | ... | 1186346810 | 3543393 | 334805269 | NorthAmerica | 3 | 284 | 0 | 2982 | |||||
| 1 | 2 | India | 44999366 | 531930 | 44466968 | 468 | N/A | ... | 930797975 | 661721 | 1406631776 | Asia | 31 | 2644 | 2 | 0.3 | |||||
| 2 | 3 | France | 40138560 | 167642 | 39970918 | 0 | 869 | ... | 271490188 | 4139547 | 65584518 | Europe | 2 | 391 | 0 | ||||||
| 3 | 4 | Germany | 38504530 | 176200 | 38240600 | 87730 | N/A | ... | 122332384 | 1458359 | 83883596 | Europe | 2 | 476 | 1 | 1046 | |||||
| 4 | 5 | Brazil | 37827912 | 705962 | 36249161 | 872789 | N/A | ... | 63776166 | 296146 | 215353593 | SouthAmerica | 6 | 305 | 3 | 4053 | |||||
| 722 | 227 | DiamondPrincess | 712 | 13 | 699 | 0 | ... | ||||||||||||||
| 723 | 228 | Tokelau | 80 | 80 | ... | 1378 | Australia/Oceania | 17 | 58055 | ||||||||||||
| 724 | 229 | VaticanCity | 29 | 29 | 0 | ... | 799 | Europe | 28 | ||||||||||||
| 725 | 230 | WesternSahara | 10 | 1 | 9 | 0 | ... | 626161 | Africa | 62616 | 626161 | ||||||||||
| 726 | 231 | MSZaandam | 9 | 2 | 7 | 0 | ... |
10 rows × 22 columns
In [25]:
#The object columns have to be changed to Float and the empty rows have to be filled with
# NaN values as well
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 727 entries, 0 to 726 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 # 725 non-null object 1 Country,Other 725 non-null object 2 TotalCases 725 non-null object 3 NewCases 725 non-null object 4 TotalDeaths 725 non-null object 5 NewDeaths 725 non-null object 6 TotalRecovered 725 non-null object 7 NewRecovered 725 non-null object 8 ActiveCases 725 non-null object 9 Serious,Critical 725 non-null object 10 TotCases/1Mpop 725 non-null object 11 Deaths/1Mpop 725 non-null object 12 TotalTests 725 non-null object 13 Tests/1Mpop 725 non-null object 14 Population 725 non-null object 15 Continent 725 non-null object 16 1CaseeveryXppl 725 non-null object 17 1DeatheveryXppl 725 non-null object 18 1TesteveryXppl 725 non-null object 19 NewCases/1Mpop 725 non-null object 20 NewDeaths/1Mpop 725 non-null object 21 ActiveCases/1Mpop 725 non-null object dtypes: object(22) memory usage: 125.1+ KB
In [26]:
#The DataFrame has many N/A string values that need to be change to NaN values
df=df.apply(lambda x :x.replace('',np.nan))
df=df.apply(lambda x:x.replace('N/A',np.nan))
In [27]:
#Now we can change the object columns to Floats
df.head().append(df.tail(5))
Out[27]:
| # | Country,Other | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | ... | TotalTests | Tests/1Mpop | Population | Continent | 1CaseeveryXppl | 1DeatheveryXppl | 1TesteveryXppl | NewCases/1Mpop | NewDeaths/1Mpop | ActiveCases/1Mpop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | USA | 108862996 | NaN | 1178436 | NaN | 106686109 | NaN | 998451 | 1605 | ... | 1186346810 | 3543393 | 334805269 | NorthAmerica | 3 | 284 | 0 | NaN | NaN | 2982 |
| 1 | 2 | India | 44999366 | NaN | 531930 | NaN | 44466968 | NaN | 468 | NaN | ... | 930797975 | 661721 | 1406631776 | Asia | 31 | 2644 | 2 | NaN | NaN | 0.3 |
| 2 | 3 | France | 40138560 | NaN | 167642 | NaN | 39970918 | NaN | 0 | 869 | ... | 271490188 | 4139547 | 65584518 | Europe | 2 | 391 | 0 | NaN | NaN | NaN |
| 3 | 4 | Germany | 38504530 | NaN | 176200 | NaN | 38240600 | NaN | 87730 | NaN | ... | 122332384 | 1458359 | 83883596 | Europe | 2 | 476 | 1 | NaN | NaN | 1046 |
| 4 | 5 | Brazil | 37827912 | NaN | 705962 | NaN | 36249161 | NaN | 872789 | NaN | ... | 63776166 | 296146 | 215353593 | SouthAmerica | 6 | 305 | 3 | NaN | NaN | 4053 |
| 722 | 227 | DiamondPrincess | 712 | NaN | 13 | NaN | 699 | NaN | 0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 723 | 228 | Tokelau | 80 | NaN | NaN | NaN | NaN | NaN | 80 | NaN | ... | NaN | NaN | 1378 | Australia/Oceania | 17 | NaN | NaN | NaN | NaN | 58055 |
| 724 | 229 | VaticanCity | 29 | NaN | NaN | NaN | 29 | NaN | 0 | NaN | ... | NaN | NaN | 799 | Europe | 28 | NaN | NaN | NaN | NaN | NaN |
| 725 | 230 | WesternSahara | 10 | NaN | 1 | NaN | 9 | NaN | 0 | NaN | ... | NaN | NaN | 626161 | Africa | 62616 | 626161 | NaN | NaN | NaN | NaN |
| 726 | 231 | MSZaandam | 9 | NaN | 2 | NaN | 7 | NaN | 0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 rows × 22 columns
In [30]:
column_names =[ 'TotalCases', 'NewCases', 'TotalDeaths',
'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases',
'Serious,Critical', 'TotCases/1Mpop', 'Deaths/1Mpop', 'TotalTests',
'Tests/1Mpop', 'Population','1CaseeveryXppl',
'1DeatheveryXppl', '1TesteveryXppl', 'NewCases/1Mpop',
'NewDeaths/1Mpop', 'ActiveCases/1Mpop']
df[column_names]=df[column_names].astype(float)
In [32]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 727 entries, 0 to 726 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 # 693 non-null object 1 Country,Other 723 non-null object 2 TotalCases 725 non-null float64 3 NewCases 51 non-null float64 4 TotalDeaths 710 non-null float64 5 NewDeaths 31 non-null float64 6 TotalRecovered 659 non-null float64 7 NewRecovered 60 non-null float64 8 ActiveCases 665 non-null float64 9 Serious,Critical 383 non-null float64 10 TotCases/1Mpop 691 non-null float64 11 Deaths/1Mpop 676 non-null float64 12 TotalTests 639 non-null float64 13 Tests/1Mpop 639 non-null float64 14 Population 687 non-null float64 15 Continent 715 non-null object 16 1CaseeveryXppl 687 non-null float64 17 1DeatheveryXppl 672 non-null float64 18 1TesteveryXppl 639 non-null float64 19 NewCases/1Mpop 33 non-null float64 20 NewDeaths/1Mpop 15 non-null float64 21 ActiveCases/1Mpop 615 non-null float64 dtypes: float64(19), object(3) memory usage: 125.1+ KB
In [34]:
df.describe()
Out[34]:
| TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | TotCases/1Mpop | Deaths/1Mpop | TotalTests | Tests/1Mpop | Population | 1CaseeveryXppl | 1DeatheveryXppl | 1TesteveryXppl | NewCases/1Mpop | NewDeaths/1Mpop | ActiveCases/1Mpop | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.250000e+02 | 51.000000 | 7.100000e+02 | 31.000000 | 6.590000e+02 | 60.000000 | 6.650000e+02 | 383.000000 | 691.000000 | 676.000000 | 6.390000e+02 | 6.390000e+02 | 6.870000e+02 | 687.000000 | 672.000000 | 639.000000 | 33.000000 | 15.000000 | 615.000000 |
| mean | 1.056595e+07 | 1640.431373 | 1.072859e+05 | 8.129032 | 1.097041e+07 | 4068.166667 | 2.874906e+05 | 893.506527 | 199277.767728 | 1262.851479 | 3.289847e+07 | 2.141892e+06 | 3.469404e+07 | 401.882096 | 14429.181548 | 10.694836 | 35.841515 | 0.458667 | 32067.316260 |
| std | 5.834581e+07 | 4145.412709 | 5.788394e+05 | 8.961927 | 5.866719e+07 | 7355.976487 | 2.001352e+06 | 4359.903147 | 200951.816462 | 1294.042739 | 1.194338e+08 | 3.634674e+06 | 1.384352e+08 | 4141.170312 | 57011.083639 | 26.699756 | 48.010156 | 0.532244 | 95867.961404 |
| min | 9.000000e+00 | 2.000000 | 1.000000e+00 | 1.000000 | 2.000000e+00 | 1.000000 | 0.000000e+00 | 0.000000 | 16.000000 | 2.000000 | 7.850000e+03 | 5.093000e+03 | 7.990000e+02 | 1.000000 | 152.000000 | 0.000000 | 0.030000 | 0.010000 | 0.200000 |
| 25% | 2.654700e+04 | 38.000000 | 2.250000e+02 | 3.000000 | 2.310200e+04 | 75.500000 | 6.400000e+01 | 4.000000 | 18885.000000 | 176.000000 | 3.478150e+05 | 1.981990e+05 | 4.454310e+05 | 3.000000 | 478.750000 | 0.000000 | 0.400000 | 0.090000 | 70.000000 |
| 50% | 2.303540e+05 | 191.000000 | 2.628000e+03 | 4.000000 | 2.599530e+05 | 421.000000 | 9.710000e+02 | 14.000000 | 124062.000000 | 869.000000 | 2.226216e+06 | 8.851190e+05 | 5.797805e+06 | 8.000000 | 1181.500000 | 1.000000 | 10.000000 | 0.300000 | 549.000000 |
| 75% | 1.721838e+06 | 743.000000 | 1.884325e+04 | 9.500000 | 1.998448e+06 | 3187.000000 | 1.053900e+04 | 59.000000 | 325150.000000 | 2087.000000 | 1.312845e+07 | 2.439107e+06 | 2.210284e+07 | 53.000000 | 5800.750000 | 5.000000 | 64.000000 | 0.700000 | 3339.000000 |
| max | 6.964063e+08 | 18250.000000 | 6.924844e+06 | 28.000000 | 6.683491e+08 | 24858.000000 | 2.115834e+07 | 38127.000000 | 724512.000000 | 6595.000000 | 1.186347e+09 | 2.330212e+07 | 1.448471e+09 | 62616.000000 | 626161.000000 | 196.000000 | 163.000000 | 2.000000 | 612553.000000 |
Error
You are trying to load a table of an unknown type. Probably you did not activate the addon which is required to use this table type.
You are trying to load a table of an unknown type. Probably you did not activate the addon which is required to use this table type.