CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

| Download
Views: 30
Image: ubuntu2204
Kernel: Python 3 (system-wide)

Data Collection via Webscraping from SpaceX Wikipedia

# Importing necessary libraries
import re
import sys
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Function(s) to parse web scraped HTML table


def dt(table_cells):
    """Return the first two whitespace-stripped text fragments of a cell.

    Args:
        table_cells: object exposing a ``strings`` iterable of text
            fragments (the caller passes the date/time table cell).

    Returns:
        A list with at most two stripped strings, in document order.
    """
    return [fragment.strip() for fragment in table_cells.strings][:2]


def bv(table_cells):
    """Concatenate the even-indexed text fragments of a cell, minus the last.

    Args:
        table_cells: object exposing a ``strings`` iterable of text fragments.

    Returns:
        The joined string; empty when fewer than two fragments are kept.
    """
    # Keep fragments 0, 2, 4, ... and then drop the final kept fragment.
    kept = list(table_cells.strings)[::2]
    return ''.join(kept[:-1])


def lnds(table_cells):
    """Return the first text fragment of a cell.

    Args:
        table_cells: object exposing a ``strings`` iterable of text fragments.

    Returns:
        The first fragment, or ``None`` when the cell has no text at all
        (previously an empty cell raised ``IndexError``).
    """
    return next(iter(table_cells.strings), None)


def g_ms(table_cells):
    """Extract the leading "<number> kg" part of a payload-mass cell.

    Args:
        table_cells: object exposing a ``text`` attribute.

    Returns:
        ``0`` (an int) when the normalized text is empty; the text up to and
        including the first "kg" when present; otherwise the whole
        normalized text unchanged.
    """
    # NFKD turns compatibility characters such as non-breaking spaces into
    # their plain equivalents before the text is searched.
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if not mass:
        return 0
    kg_at = mass.find("kg")
    if kg_at == -1:
        # Without this guard the slice below would be mass[0:1] and silently
        # return only the first character of the cell.
        return mass
    return mass[:kg_at + 2]


def exColhd(row):
    """Return a cleaned column name from a header cell.

    The first ``br``, ``a`` and ``sup`` children (when present) are removed
    from ``row`` in place before its remaining contents are joined.

    Args:
        row: header cell exposing ``br``, ``a``, ``sup`` and ``contents``.

    Returns:
        The stripped column name, or ``None`` when the name consists only of
        digits.
    """
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
    column_name = ' '.join(row.contents)
    if not column_name.strip().isdigit():
        return column_name.strip()
    # Purely numeric headers are not column names.
    return None
# Requesting Falcon9 Launch Wiki page from URL
# The oldid parameter pins a specific revision of the article.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
rsp = response.text

soup = BeautifulSoup(rsp, 'html.parser')
soup.title
<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>
# Getting all Columns/Variables from the HTML table header
html_tables = soup.find_all('table')

# The header cells are read from the third table on the page (index 2)
t_targ = html_tables[2]
t_head = t_targ.find_all('th')

# exColhd yields None for purely numeric headers; empty names are dropped too
column_names = [name for name in map(exColhd, t_head) if name]
print(column_names)
['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']
# Creating a dataframe by parsing the HTML tables
dict2 = dict.fromkeys(column_names)
# The combined header is replaced by the separate 'Date' and 'Time' columns
# below; pop() with a default avoids a KeyError if the header text differs.
dict2.pop('Date and time ( )', None)

# Initializing the dictionary to have each value be an empty list
for _column in ('Flight No.', 'Launch site', 'Payload', 'Payload mass',
                'Orbit', 'Customer', 'Launch outcome', 'Version Booster',
                'Booster landing', 'Date', 'Time'):
    dict2[_column] = []


def _link_text(cell):
    """Return the string of the cell's first link, or the cell's own text.

    Some cells carry plain text with no <a> element; reading ``.a.string``
    on those raised AttributeError and aborted the whole scrape.
    """
    anchor = cell.a
    if anchor is None:
        return cell.text.strip()
    return anchor.string


# Parsing the HTML table to fill our dictionary
extracted_row = 0
for table in soup.find_all('table', "wikitable plainrowheaders collapsible"):
    for rows in table.find_all("tr"):
        # A launch row is one whose header cell holds a purely numeric flight
        # number. Each row is judged on its own so that a header without a
        # single string cannot inherit the verdict of the previous row.
        header = rows.th
        if header is None or not header.string:
            continue
        flight_number = header.string.strip()
        if not flight_number.isdigit():
            continue

        row = rows.find_all("td")
        extracted_row += 1

        dict2['Flight No.'].append(flight_number)
        print(flight_number)

        datatimelist = dt(row[0])
        date = datatimelist[0].strip(',')
        dict2['Date'].append(date)
        print(date)

        time = datatimelist[1]
        dict2['Time'].append(time)
        print(time)

        bsv = bv(row[1])
        dict2['Version Booster'].append(bsv)
        print(bsv)

        launch_site = _link_text(row[2])
        dict2['Launch site'].append(launch_site)
        print(launch_site)

        payload = _link_text(row[3])
        dict2['Payload'].append(payload)
        print(payload)

        payload_mass = g_ms(row[4])
        dict2['Payload mass'].append(payload_mass)
        # Previously printed `payload` a second time by mistake.
        print(payload_mass)

        orbit = _link_text(row[5])
        dict2['Orbit'].append(orbit)
        print(orbit)

        customer = _link_text(row[6])
        dict2['Customer'].append(customer)
        print(customer)

        launch_outcome = list(row[7].strings)[0]
        dict2['Launch outcome'].append(launch_outcome)
        print(launch_outcome)

        booster_landing = lnds(row[8])
        dict2['Booster landing'].append(booster_landing)
        print(booster_landing)
1 4 June 2010 18:45 F9 v1.0B0003.1 CCAFS Dragon Spacecraft Qualification Unit Dragon Spacecraft Qualification Unit LEO SpaceX Success Failure 2 8 December 2010 15:43 F9 v1.0B0004.1 CCAFS Dragon Dragon LEO NASA Success Failure 3 22 May 2012 07:44 F9 v1.0B0005.1 CCAFS Dragon Dragon LEO NASA Success Not attempted 4 8 October 2012 00:35 F9 v1.0B0006.1 CCAFS SpaceX CRS-1 SpaceX CRS-1 LEO NASA Success No attempt 5 1 March 2013 15:10 F9 v1.0B0007.1 CCAFS SpaceX CRS-2 SpaceX CRS-2 LEO NASA Success Not attempted 6 29 September 2013 16:00 F9 v1.1B1003 VAFB CASSIOPE CASSIOPE Polar orbit MDA Success Uncontrolled 7 3 December 2013 22:41 CCAFS SES-8 SES-8 GTO SES Success Not attempted 8 6 January 2014 22:06 CCAFS Thaicom 6 Thaicom 6 GTO Thaicom Success Not attempted 9 18 April 2014 19:25 Cape Canaveral SpaceX CRS-3 SpaceX CRS-3 LEO NASA Success Controlled 10 14 July 2014 15:15 Cape Canaveral Orbcomm-OG2 Orbcomm-OG2 LEO Orbcomm Success Controlled 11 5 August 2014 08:00 Cape Canaveral AsiaSat 8 AsiaSat 8 GTO AsiaSat Success Not attempted 12 7 September 2014 05:00 F9 v1.1 Cape Canaveral AsiaSat 6 AsiaSat 6 GTO AsiaSat Success Not attempted 13 21 September 2014 05:52 F9 v1.1 Cape Canaveral SpaceX CRS-4 SpaceX CRS-4 LEO NASA Success Uncontrolled 14 10 January 2015 09:47 F9 v1.1 Cape Canaveral SpaceX CRS-5 SpaceX CRS-5 LEO NASA Success Failure 15 11 February 2015 23:03 F9 v1.1 Cape Canaveral DSCOVR DSCOVR HEO USAF Success Controlled 16 2 March 2015 03:50 F9 v1.1 Cape Canaveral ABS-3A ABS-3A GTO ABS Success Not attempted 17 14 April 2015 20:10 F9 v1.1 Cape Canaveral SpaceX CRS-6 SpaceX CRS-6 LEO NASA Success Failure 18 27 April 2015 23:03 F9 v1.1 Cape Canaveral TürkmenÄlem 52°E / MonacoSAT TürkmenÄlem 52°E / MonacoSAT GTO None Success Not attempted 19 28 June 2015 14:21 F9 v1.1 Cape Canaveral SpaceX CRS-7 SpaceX CRS-7 LEO NASA Failure Precluded 20 22 December 2015 01:29 F9 FT Cape Canaveral Orbcomm-OG2 Orbcomm-OG2 LEO Orbcomm Success Success 21 17 January 2016 18:42 F9 v1.1 VAFB 
Jason-3 Jason-3 LEO NASA Success Failure 22 4 March 2016 23:35 F9 FT Cape Canaveral SES-9 SES-9 GTO SES Success Failure 23 8 April 2016 20:43 F9 FT Cape Canaveral SpaceX CRS-8 SpaceX CRS-8 LEO NASA Success Success 24 6 May 2016 05:21 F9 FT Cape Canaveral JCSAT-14 JCSAT-14 GTO SKY Perfect JSAT Group Success Success 25 27 May 2016 21:39 F9 FT Cape Canaveral Thaicom 8 Thaicom 8 GTO Thaicom Success Success 26 15 June 2016 14:29 F9 FT Cape Canaveral ABS-2A ABS-2A GTO ABS Success Failure 27 18 July 2016 04:45 F9 FT Cape Canaveral SpaceX CRS-9 SpaceX CRS-9 LEO NASA Success Success 28 14 August 2016 05:26 F9 FT Cape Canaveral JCSAT-16 JCSAT-16 GTO SKY Perfect JSAT Group Success Success 29 14 January 2017 17:54 F9 FT VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success Success 30 19 February 2017 14:39 F9 FT KSC SpaceX CRS-10 SpaceX CRS-10 LEO NASA Success Success 31 16 March 2017 06:00 F9 FT KSC EchoStar 23 EchoStar 23 GTO EchoStar Success Not attempted 32 30 March 2017 22:27 F9 FT♺ KSC SES-10 SES-10 GTO SES Success Success 33 1 May 2017 11:15 F9 FT KSC NROL-76 NROL-76 LEO NRO Success Success 34 15 May 2017 23:21 F9 FT KSC Inmarsat-5 F4 Inmarsat-5 F4 GTO Inmarsat Success Not attempted 35 3 June 2017 21:07 F9 FT KSC SpaceX CRS-11 SpaceX CRS-11 LEO NASA Success Success 36 23 June 2017 19:10 F9 FTB1029.2 KSC BulgariaSat-1 BulgariaSat-1 GTO Bulsatcom Success Success 37 25 June 2017 20:25 F9 FT VAFB Iridium NEXT Iridium NEXT LEO Iridium Communications Success Success 38 5 July 2017 23:38 F9 FT KSC Intelsat 35e Intelsat 35e GTO Intelsat Success Not attempted 39 14 August 2017 16:31 F9 B4 KSC SpaceX CRS-12 SpaceX CRS-12 LEO NASA Success Success 40 24 August 2017 18:51 F9 FT VAFB Formosat-5 Formosat-5 SSO NSPO Success Success 41 7 September 2017 14:00 F9 B4 KSC Boeing X-37B Boeing X-37B LEO USAF Success Success 42 9 October 2017 12:37 F9 B4 VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success Success 43 11 October 2017 22:53:00 F9 FTB1031.2 KSC SES-11 
SES-11 GTO SES S.A. Success Success 44 30 October 2017 19:34 F9 B4 KSC Koreasat 5A Koreasat 5A GTO KT Corporation Success Success 45 15 December 2017 15:36 F9 FTB1035.2 Cape Canaveral SpaceX CRS-13 SpaceX CRS-13 LEO NASA Success Success 46 23 December 2017 01:27 F9 FTB1036.2 VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success Controlled 47 8 January 2018 01:00 F9 B4 CCAFS Zuma Zuma LEO Northrop Grumman Success Success 48 31 January 2018 21:25 F9 FTB1032.2 CCAFS GovSat-1 GovSat-1 GTO SES Success Controlled 49 22 February 2018 14:17 F9 FTB1038.2 VAFB Paz Paz SSO Hisdesat Success Not attempted 50 6 March 2018 05:33 F9 B4 CCAFS Hispasat 30W-6 Hispasat 30W-6 GTO Hispasat Success No attempt 51 30 March 2018 14:14 F9 B4B1041.2 VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success No attempt 52 2 April 2018 20:30 F9 B4B1039.2 CCAFS SpaceX CRS-14 SpaceX CRS-14 LEO NASA Success Not attempted 53 18 April 2018 22:51 F9 B4 CCAFS Transiting Exoplanet Survey Satellite Transiting Exoplanet Survey Satellite HEO NASA Success Success 54 11 May 2018 20:14 F9 B5B1046.1 KSC Bangabandhu-1 Bangabandhu-1 GTO Thales-Alenia Success Success 55 22 May 2018 19:47 F9 B4B1043.2 VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success Not attempted 56 4 June 2018 04:45 F9 B4B1040.2 CCAFS SES-12 SES-12 GTO SES Success Not attempted 57 29 June 2018 09:42 F9 B4B1045.2 CCAFS SpaceX CRS-15 SpaceX CRS-15 LEO NASA Success Not attempted 58 22 July 2018 05:50 F9 B5 CCAFS Telstar 19V Telstar 19V GTO Telesat Success Success 59 25 July 2018 11:39 F9 B5B1048 VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success Success 60 7 August 2018 05:18 F9 B5B1046.2 CCAFS Merah Putih Merah Putih GTO Telkom Indonesia Success Success 61 10 September 2018 04:45 F9 B5 CCAFS Telstar 18V Telstar 18V GTO Telesat Success Success 62 8 October 2018 02:22 F9 B5B1048.2 VAFB SAOCOM 1A SAOCOM 1A SSO CONAE Success Success 63 15 November 2018 20:46 F9 B5B1047.2 KSC Es'hail 2 Es'hail 
2 GTO Es'hailSat Success Success 64 3 December 2018 18:34:05 F9 B5B1046.3 VAFB SSO-A SSO-A SSO Spaceflight Industries Success Success 65 5 December 2018 18:16 F9 B5 CCAFS SpaceX CRS-16 SpaceX CRS-16 LEO NASA Success Failure 66 23 December 2018 13:51 F9 B5 CCAFS GPS III GPS III MEO USAF Success Not attempted 67 11 January 2019 15:31 F9 B5B1049.2 VAFB Iridium NEXT Iridium NEXT Polar Iridium Communications Success Success 68 22 February 2019 01:45 F9 B5B1048.3 CCAFS Nusantara Satu Nusantara Satu GTO PSN Success Success 69 2 March 2019 07:49 F9 B5[268] KSC Crew Dragon Demo-1 Crew Dragon Demo-1 LEO NASA Success Success 70 4 May 2019 06:48 F9 B5 CCAFS SpaceX CRS-17 SpaceX CRS-17 LEO NASA Success Success 71 24 May 2019 02:30 F9 B5B1049.3 CCAFS Starlink Starlink LEO SpaceX Success Success 72 12 June 2019 14:17 F9 B5B1051.2 VAFB RADARSAT Constellation RADARSAT Constellation SSO Canadian Space Agency Success Success 73 25 July 2019 22:01 F9 B5B1056.2 CCAFS SpaceX CRS-18 SpaceX CRS-18 LEO NASA Success Success 74 6 August 2019 23:23 F9 B5B1047.3 CCAFS AMOS-17 AMOS-17 GTO Spacecom Success Not attempted 75 11 November 2019 14:56 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Success 76 5 December 2019 17:29 F9 B5 CCAFS SpaceX CRS-19 SpaceX CRS-19 LEO NASA Success Success 77 17 December 2019 00:10 F9 B5B1056.3 CCAFS JCSat-18 JCSat-18 GTO Sky Perfect JSAT Success Success 78 7 January 2020 02:19:21 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Success 79 19 January 2020 15:30 F9 B5 KSC Crew Dragon in-flight abort test Crew Dragon in-flight abort test Sub-orbital NASA Success Not attempted 80 29 January 2020 14:07 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Success 81 17 February 2020 15:05 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Failure 82 7 March 2020 04:50 F9 B5 CCAFS SpaceX CRS-20 SpaceX CRS-20 LEO NASA Success Success 83 18 March 2020 12:16 F9 B5 KSC Starlink Starlink LEO SpaceX Success Failure 84 22 April 2020 19:30 F9 B5 KSC Starlink Starlink LEO SpaceX 
Success Success 85 30 May 2020 19:22 F9 B5 KSC Crew Dragon Demo-2 Crew Dragon Demo-2 LEO NASA Success Success 86 4 June 2020 01:25 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Success 87 13 June 2020 09:21 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Success 88 30 June 2020 20:10:46 F9 B5 CCAFS GPS III GPS III MEO U.S. Space Force Success Success 89 20 July 2020 21:30 F9 B5B1058.2 CCAFS ANASIS-II ANASIS-II GTO Republic of Korea Army Success Success 90 7 August 2020 05:12 F9 B5 KSC Starlink Starlink LEO SpaceX Success Success 91 18 August 2020 14:31 F9 B5B1049.6 CCAFS Starlink Starlink LEO SpaceX Success Success 92 30 August 2020 23:18 F9 B5 CCAFS SAOCOM 1B SAOCOM 1B SSO CONAE Success Success 93 3 September 2020 12:46:14 F9 B5B1060.2 KSC Starlink Starlink LEO SpaceX Success Success 94 6 October 2020 11:29:34 F9 B5B1058.3 KSC Starlink Starlink LEO SpaceX Success Success 95 18 October 2020 12:25:57 F9 B5B1051.6 KSC Starlink Starlink LEO SpaceX Success Success 96 24 October 2020 15:31:34 F9 B5 CCAFS Starlink Starlink LEO SpaceX Success Success 97 5 November 2020 23:24:23 F9 B5 CCAFS GPS III GPS III MEO USSF Success Success 98 16 November 2020 00:27 F9 B5 KSC Crew-1 Crew-1 LEO NASA Success Success 99 21 November 2020 17:17:08 F9 B5 VAFB Sentinel-6 Michael Freilich (Jason-CS A) Sentinel-6 Michael Freilich (Jason-CS A) LEO NASA Success Success 100 25 November 2020 02:13 F9 B5 ♺ CCAFS Starlink Starlink LEO SpaceX Success Success 101 6 December 2020 16:17:08 F9 B5 ♺ KSC SpaceX CRS-21 SpaceX CRS-21 LEO NASA Success Success 102 13 December 2020 17:30:00 F9 B5 ♺ CCSFS SXM-7 SXM-7 GTO Sirius XM Success Success 103 19 December 2020 14:00:00 F9 B5 ♺ KSC NROL-108 NROL-108 LEO NRO Success Success 104 8 January 2021 02:15 F9 B5 CCSFS Türksat 5A Türksat 5A GTO Türksat Success Success 105 20 January 2021 13:02 F9 B5B1051.8 KSC Starlink Starlink LEO SpaceX Success Success 106 24 January 2021 15:00 F9 B5B1058.5 CCSFS Transporter-1 Transporter-1 SSO
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) /tmp/ipykernel_1129/2140478783.py in <cell line: 25>() 67 print(orbit) 68 ---> 69 customer = row[6].a.string 70 dict2['Customer'].append(customer) 71 print(customer) AttributeError: 'NoneType' object has no attribute 'string'
# Build the frame column by column; every list is wrapped in its own Series
# before being handed to the DataFrame constructor.
series_by_column = {}
for column_label, column_values in dict2.items():
    series_by_column[column_label] = pd.Series(column_values)
df2 = pd.DataFrame(series_by_column)

# Exporting our dataframe to CSV
df2.to_csv('dataset_p2_Webscraping.csv', index=False)