## Data Collection via Webscraping from SpaceX Wikipedia

In [3]:
# Importing necessary libraries

import sys
from bs4 import BeautifulSoup
import re
import unicodedata
import requests
import pandas as pd

In [4]:
# Function(s) to parse web scraped HTML table

def dt(table_cells):
 return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def bv(table_cells):
 out=''.join([booster_version for i, booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
 return out 

def lnds(table_cells):
 out=[i for i in table_cells.strings][0]
 return out

def g_ms(table_cells):
 mass=unicodedata.normalize("NFKD", table_cells.text).strip()
 if mass:
 mass.find("kg")
 new_mass=mass[0:mass.find("kg")+2]
 else:
 new_mass=0
 return new_mass

def exColhd(row):
 if (row.br):
 row.br.extract()
 if row.a:
 row.a.extract()
 if row.sup:
 row.sup.extract()
 
 column_name = ' '.join(row.contents)
 
 if not(column_name.strip().isdigit()):
 column_name = column_name.strip()
 return column_name

In [5]:
# Requesting Falcon9 Launch Wiki page from URL 

url = 'https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922'
rsp = requests.get(url).text
soup = BeautifulSoup(rsp, 'html.parser')
soup.title

List of Falcon 9 and Falcon Heavy launches - Wikipedia

In [6]:
# Getting all Columns/Variables from the HTML table header

html_tables = soup.find_all('table')

# Extracting from the third table onward
t_targ = html_tables[2]
column_names = []
t_head = t_targ.find_all('th')

for th in t_head:
 column_name = exColhd(th)
 if column_name is not None and len(column_name) > 0:
 column_names.append(column_name)
 
 
print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [7]:
# Creating a dataframe by parsing the HTML tables

dict2 = dict.fromkeys(column_names)

del dict2['Date and time ( )']

# Initializing the dictionary to have each value be an empty list

dict2['Flight No.'] = []
dict2['Launch site'] = []
dict2['Payload'] = []
dict2['Payload mass'] = []
dict2['Orbit'] = []
dict2['Customer'] = []
dict2['Launch outcome'] = []
dict2['Version Booster']=[]
dict2['Booster landing']=[]
dict2['Date']=[]
dict2['Time']=[]

# Parsing the HTML table to fill our dictionary 

extracted_row = 0

for table_number, table in enumerate(soup.find_all('table',"wikitable plainrowheaders collapsible")):
 for rows in table.find_all("tr"):
 if rows.th:
 if rows.th.string:
 flight_number = rows.th.string.strip()
 flag = flight_number.isdigit()
 else:
 flag=False
 row=rows.find_all("td")
 if flag:
 extracted_row += 1
 
 dict2['Flight No.'].append(flight_number)
 print(flight_number)
 datatimelist=dt(row[0])
 
 date = datatimelist[0].strip(',')
 dict2['Date'].append(date)
 print(date)
 
 time = datatimelist[1]
 dict2['Time'].append(time)
 print(time)
 
 bsv = bv(row[1])
 dict2['Version Booster'].append(bsv)
 print(bsv)
 
 launch_site = row[2].a.string
 dict2['Launch site'].append(launch_site)
 print(launch_site)
 
 payload = row[3].a.string
 dict2['Payload'].append(payload)
 print(payload)
 
 payload_mass = g_ms(row[4])
 dict2['Payload mass'].append(payload_mass)
 print(payload)
 
 orbit = row[5].a.string
 dict2['Orbit'].append(orbit)
 print(orbit)
 
 customer = row[6].a.string
 dict2['Customer'].append(customer)
 print(customer)
 
 launch_outcome = list(row[7].strings)[0]
 dict2['Launch outcome'].append(launch_outcome)
 print(launch_outcome)
 
 booster_landing = lnds(row[8])
 dict2['Booster landing'].append(booster_landing)
 print(booster_landing)

1
4 June 2010
18:45
F9 v1.0B0003.1
CCAFS
Dragon Spacecraft Qualification Unit
Dragon Spacecraft Qualification Unit
LEO
SpaceX
Success

Failure
2
8 December 2010
15:43
F9 v1.0B0004.1
CCAFS
Dragon
Dragon
LEO
NASA
Success
Failure
3
22 May 2012
07:44
F9 v1.0B0005.1
CCAFS
Dragon
Dragon
LEO
NASA
Success
Not attempted

4
8 October 2012
00:35
F9 v1.0B0006.1
CCAFS
SpaceX CRS-1
SpaceX CRS-1
LEO
NASA
Success

No attempt
5
1 March 2013
15:10
F9 v1.0B0007.1
CCAFS
SpaceX CRS-2
SpaceX CRS-2
LEO
NASA
Success

Not attempted

6
29 September 2013
16:00
F9 v1.1B1003
VAFB
CASSIOPE
CASSIOPE
Polar orbit
MDA
Success
Uncontrolled
7
3 December 2013
22:41

CCAFS
SES-8
SES-8
GTO
SES
Success
Not attempted
8
6 January 2014
22:06

CCAFS
Thaicom 6
Thaicom 6
GTO
Thaicom
Success
Not attempted
9
18 April 2014
19:25

Cape Canaveral
SpaceX CRS-3
SpaceX CRS-3
LEO
NASA
Success

Controlled
10
14 July 2014
15:15

Cape Canaveral
Orbcomm-OG2
Orbcomm-OG2
LEO
Orbcomm
Success
Controlled
11
5 August 2014
08:00

Cape Canaveral
AsiaS

AttributeError: 'NoneType' object has no attribute 'string'

In [9]:
df2= pd.DataFrame({ key:pd.Series(value) for key, value in dict2.items() })

In [10]:
# Exporting our dataframe to CSV

df2.to_csv('dataset_p2_Webscraping.csv', index=False)