{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Data Collection via Webscraping from SpaceX Wikipedia" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Importing necessary libraries\n", "\n", "import sys\n", "from bs4 import BeautifulSoup\n", "import re\n", "import unicodedata\n", "import requests\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Function(s) to parse web scraped HTML table\n", "\n", "def dt(table_cells):\n", " return [data_time.strip() for data_time in list(table_cells.strings)][0:2]\n", "\n", "def bv(table_cells):\n", " out=''.join([booster_version for i, booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])\n", " return out \n", "\n", "def lnds(table_cells):\n", " out=[i for i in table_cells.strings][0]\n", " return out\n", "\n", "def g_ms(table_cells):\n", " mass=unicodedata.normalize(\"NFKD\", table_cells.text).strip()\n", " if mass:\n", " mass.find(\"kg\")\n", " new_mass=mass[0:mass.find(\"kg\")+2]\n", " else:\n", " new_mass=0\n", " return new_mass\n", "\n", "def exColhd(row):\n", " if (row.br):\n", " row.br.extract()\n", " if row.a:\n", " row.a.extract()\n", " if row.sup:\n", " row.sup.extract()\n", " \n", " column_name = ' '.join(row.contents)\n", " \n", " if not(column_name.strip().isdigit()):\n", " column_name = column_name.strip()\n", " return column_name" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "List of Falcon 9 and Falcon Heavy launches - Wikipedia" ] }, "execution_count": 5, "metadata": { }, "output_type": "execute_result" } ], "source": [ "# Requesting Falcon9 Launch Wiki page from URL \n", "\n", "url = 'https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922'\n", "rsp = requests.get(url).text\n", "soup = BeautifulSoup(rsp, 'html.parser')\n", "soup.title" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']\n" ] } ], "source": [ "# Getting all Columns/Variables from the HTML table header\n", "\n", "html_tables = soup.find_all('table')\n", "\n", "# Extracting from the third table onward\n", "t_targ = html_tables[2]\n", "column_names = []\n", "t_head = t_targ.find_all('th')\n", "\n", "for th in t_head:\n", " column_name = exColhd(th)\n", " if column_name is not None and len(column_name) > 0:\n", " column_names.append(column_name)\n", " \n", " \n", "print(column_names)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "4 June 2010\n", "18:45\n", "F9 v1.0B0003.1\n", "CCAFS\n", "Dragon Spacecraft Qualification Unit\n", "Dragon Spacecraft Qualification Unit\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Failure\n", "2\n", "8 December 2010\n", "15:43\n", "F9 v1.0B0004.1\n", "CCAFS\n", "Dragon\n", "Dragon\n", "LEO\n", "NASA\n", "Success\n", "Failure\n", "3\n", "22 May 2012\n", "07:44\n", "F9 v1.0B0005.1\n", "CCAFS\n", "Dragon\n", "Dragon\n", "LEO\n", "NASA\n", "Success\n", "Not attempted\n", "\n", "4\n", "8 October 2012\n", "00:35\n", "F9 v1.0B0006.1\n", "CCAFS\n", "SpaceX CRS-1\n", "SpaceX CRS-1\n", "LEO\n", "NASA\n", "Success\n", "\n", "No attempt\n", "5\n", "1 March 2013\n", "15:10\n", "F9 v1.0B0007.1\n", "CCAFS\n", "SpaceX CRS-2\n", "SpaceX CRS-2\n", "LEO\n", "NASA\n", "Success\n", "\n", "Not attempted\n", "\n", "6\n", "29 September 2013\n", "16:00\n", "F9 v1.1B1003\n", "VAFB\n", "CASSIOPE\n", "CASSIOPE\n", "Polar orbit\n", "MDA\n", "Success\n", "Uncontrolled\n", "7\n", "3 December 2013\n", "22:41\n", "\n", "CCAFS\n", "SES-8\n", "SES-8\n", "GTO\n", "SES\n", "Success\n", "Not attempted\n", "8\n", "6 January 2014\n", "22:06\n", "\n", "CCAFS\n", "Thaicom 6\n", "Thaicom 6\n", "GTO\n", "Thaicom\n", "Success\n", "Not attempted\n", "9\n", "18 April 2014\n", "19:25\n", "\n", "Cape Canaveral\n", "SpaceX CRS-3\n", "SpaceX CRS-3\n", "LEO\n", "NASA\n", "Success\n", "\n", "Controlled\n", "10\n", "14 July 2014\n", "15:15\n", "\n", "Cape Canaveral\n", "Orbcomm-OG2\n", "Orbcomm-OG2\n", "LEO\n", "Orbcomm\n", "Success\n", "Controlled\n", "11\n", "5 August 2014\n", "08:00\n", "\n", "Cape Canaveral\n", "AsiaSat 8\n", "AsiaSat 8\n", "GTO\n", "AsiaSat\n", "Success\n", "Not attempted\n", "12\n", "7 September 2014\n", "05:00\n", "F9 v1.1\n", "Cape Canaveral\n", "AsiaSat 6\n", "AsiaSat 6\n", "GTO\n", "AsiaSat\n", "Success\n", "Not attempted\n", "\n", "13\n", "21 September 2014\n", "05:52\n", "F9 v1.1\n", "Cape Canaveral\n", "SpaceX CRS-4\n", "SpaceX CRS-4\n", "LEO\n", "NASA\n", "Success\n", "Uncontrolled\n", "14\n", "10 January 2015\n", "09:47\n", "F9 v1.1\n", "Cape Canaveral\n", "SpaceX CRS-5\n", "SpaceX CRS-5\n", "LEO\n", "NASA\n", "Success\n", "Failure \n", "15\n", "11 February 2015\n", "23:03\n", "F9 v1.1\n", "Cape Canaveral\n", "DSCOVR\n", "DSCOVR\n", "HEO\n", "USAF\n", "Success\n", "\n", "Controlled\n", "16\n", "2 March 2015\n", "03:50\n", "F9 v1.1\n", "Cape Canaveral\n", "ABS-3A\n", "ABS-3A\n", "GTO\n", "ABS\n", "Success\n", "\n", "Not attempted\n", "17\n", "14 April 2015\n", "20:10\n", "F9 v1.1\n", "Cape Canaveral\n", "SpaceX CRS-6\n", "SpaceX CRS-6\n", "LEO\n", "NASA\n", "Success\n", "\n", "Failure\n", "18\n", "27 April 2015\n", "23:03\n", "F9 v1.1\n", "Cape Canaveral\n", "TürkmenÄlem 52°E / MonacoSAT\n", "TürkmenÄlem 52°E / MonacoSAT\n", "GTO\n", "None\n", "Success\n", "\n", "Not attempted\n", "19\n", "28 June 2015\n", "14:21\n", "F9 v1.1\n", "Cape Canaveral\n", "SpaceX CRS-7\n", "SpaceX CRS-7\n", "LEO\n", "NASA\n", "Failure\n", "Precluded\n", "20\n", "22 December 2015\n", "01:29\n", "F9 FT\n", "Cape Canaveral\n", "Orbcomm-OG2\n", "Orbcomm-OG2\n", "LEO\n", "Orbcomm\n", "Success\n", "\n", "Success\n", "21\n", "17 January 2016\n", "18:42\n", "F9 v1.1\n", "VAFB\n", "Jason-3\n", "Jason-3\n", "LEO\n", "NASA\n", "Success\n", "\n", "Failure\n", "22\n", "4 March 2016\n", "23:35\n", "F9 FT\n", "Cape Canaveral\n", "SES-9\n", "SES-9\n", "GTO\n", "SES\n", "Success\n", "\n", "Failure\n", "23\n", "8 April 2016\n", "20:43\n", "F9 FT\n", "Cape Canaveral\n", "SpaceX CRS-8\n", "SpaceX CRS-8\n", "LEO\n", "NASA\n", "Success\n", "Success\n", "24\n", "6 May 2016\n", "05:21\n", "F9 FT\n", "Cape Canaveral\n", "JCSAT-14\n", "JCSAT-14\n", "GTO\n", "SKY Perfect JSAT Group\n", "Success\n", "\n", "Success\n", "25\n", "27 May 2016\n", "21:39\n", "F9 FT\n", "Cape Canaveral\n", "Thaicom 8\n", "Thaicom 8\n", "GTO\n", "Thaicom\n", "Success\n", "\n", "Success\n", "26\n", "15 June 2016\n", "14:29\n", "F9 FT\n", "Cape Canaveral\n", "ABS-2A\n", "ABS-2A\n", "GTO\n", "ABS\n", "Success\n", "\n", "Failure\n", "27\n", "18 July 2016\n", "04:45\n", "F9 FT\n", "Cape Canaveral\n", "SpaceX CRS-9\n", "SpaceX CRS-9\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "28\n", "14 August 2016\n", "05:26\n", "F9 FT\n", "Cape Canaveral\n", "JCSAT-16\n", "JCSAT-16\n", "GTO\n", "SKY Perfect JSAT Group\n", "Success\n", "\n", "Success\n", "29\n", "14 January 2017\n", "17:54\n", "F9 FT\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "\n", "Success\n", "30\n", "19 February 2017\n", "14:39\n", "F9 FT\n", "KSC\n", "SpaceX CRS-10\n", "SpaceX CRS-10\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "31\n", "16 March 2017\n", "06:00\n", "F9 FT\n", "KSC\n", "EchoStar 23\n", "EchoStar 23\n", "GTO\n", "EchoStar\n", "Success\n", "\n", "Not attempted\n", "32\n", "30 March 2017\n", "22:27\n", "F9 FT♺\n", "KSC\n", "SES-10\n", "SES-10\n", "GTO\n", "SES\n", "Success\n", "Success\n", "33\n", "1 May 2017\n", "11:15\n", "F9 FT\n", "KSC\n", "NROL-76\n", "NROL-76\n", "LEO\n", "NRO\n", "Success\n", "\n", "Success\n", "34\n", "15 May 2017\n", "23:21\n", "F9 FT\n", "KSC\n", "Inmarsat-5 F4\n", "Inmarsat-5 F4\n", "GTO\n", "Inmarsat\n", "Success\n", "\n", "Not attempted\n", "35\n", "3 June 2017\n", "21:07\n", "F9 FT\n", "KSC\n", "SpaceX CRS-11\n", "SpaceX CRS-11\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "36\n", "23 June 2017\n", "19:10\n", "F9 FTB1029.2\n", "KSC\n", "BulgariaSat-1\n", "BulgariaSat-1\n", "GTO\n", "Bulsatcom\n", "Success\n", "\n", "Success\n", "37\n", "25 June 2017\n", "20:25\n", "F9 FT\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "LEO\n", "Iridium Communications\n", "Success\n", "\n", "Success\n", "38\n", "5 July 2017\n", "23:38\n", "F9 FT\n", "KSC\n", "Intelsat 35e\n", "Intelsat 35e\n", "GTO\n", "Intelsat\n", "Success\n", "\n", "Not attempted\n", "39\n", "14 August 2017\n", "16:31\n", "F9 B4\n", "KSC\n", "SpaceX CRS-12\n", "SpaceX CRS-12\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "40\n", "24 August 2017\n", "18:51\n", "F9 FT\n", "VAFB\n", "Formosat-5\n", "Formosat-5\n", "SSO\n", "NSPO\n", "Success\n", "\n", "Success\n", "41\n", "7 September 2017\n", "14:00\n", "F9 B4\n", "KSC\n", "Boeing X-37B\n", "Boeing X-37B\n", "LEO\n", "USAF\n", "Success\n", "\n", "Success\n", "42\n", "9 October 2017\n", "12:37\n", "F9 B4\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "\n", "Success\n", "43\n", "11 October 2017\n", "22:53:00\n", "F9 FTB1031.2\n", "KSC\n", "SES-11\n", "SES-11\n", "GTO\n", "SES S.A.\n", "Success\n", "\n", "Success\n", "44\n", "30 October 2017\n", "19:34\n", "F9 B4\n", "KSC\n", "Koreasat 5A\n", "Koreasat 5A\n", "GTO\n", "KT Corporation\n", "Success\n", "\n", "Success\n", "45\n", "15 December 2017\n", "15:36\n", "F9 FTB1035.2\n", "Cape Canaveral\n", "SpaceX CRS-13\n", "SpaceX CRS-13\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "46\n", "23 December 2017\n", "01:27\n", "F9 FTB1036.2\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "Controlled\n", "47\n", "8 January 2018\n", "01:00\n", "F9 B4\n", "CCAFS\n", "Zuma\n", "Zuma\n", "LEO\n", "Northrop Grumman\n", "Success\n", "Success\n", "48\n", "31 January 2018\n", "21:25\n", "F9 FTB1032.2\n", "CCAFS\n", "GovSat-1\n", "GovSat-1\n", "GTO\n", "SES\n", "Success\n", "Controlled\n", "49\n", "22 February 2018\n", "14:17\n", "F9 FTB1038.2\n", "VAFB\n", "Paz\n", "Paz\n", "SSO\n", "Hisdesat\n", "Success\n", "Not attempted\n", "50\n", "6 March 2018\n", "05:33\n", "F9 B4\n", "CCAFS\n", "Hispasat 30W-6\n", "Hispasat 30W-6\n", "GTO\n", "Hispasat\n", "Success\n", "No attempt\n", "51\n", "30 March 2018\n", "14:14\n", "F9 B4B1041.2\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "No attempt\n", "52\n", "2 April 2018\n", "20:30\n", "F9 B4B1039.2\n", "CCAFS\n", "SpaceX CRS-14\n", "SpaceX CRS-14\n", "LEO\n", "NASA\n", "Success\n", "Not attempted\n", "53\n", "18 April 2018\n", "22:51\n", "F9 B4\n", "CCAFS\n", "Transiting Exoplanet Survey Satellite\n", "Transiting Exoplanet Survey Satellite\n", "HEO\n", "NASA\n", "Success\n", "Success\n", "54\n", "11 May 2018\n", "20:14\n", "F9 B5B1046.1\n", "KSC\n", "Bangabandhu-1\n", "Bangabandhu-1\n", "GTO\n", "Thales-Alenia\n", "Success\n", "Success\n", "55\n", "22 May 2018\n", "19:47\n", "F9 B4B1043.2\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "Not attempted\n", "56\n", "4 June 2018\n", "04:45\n", "F9 B4B1040.2\n", "CCAFS\n", "SES-12\n", "SES-12\n", "GTO\n", "SES\n", "Success\n", "Not attempted\n", "57\n", "29 June 2018\n", "09:42\n", "F9 B4B1045.2\n", "CCAFS\n", "SpaceX CRS-15\n", "SpaceX CRS-15\n", "LEO\n", "NASA\n", "Success\n", "Not attempted\n", "58\n", "22 July 2018\n", "05:50\n", "F9 B5\n", "CCAFS\n", "Telstar 19V\n", "Telstar 19V\n", "GTO\n", "Telesat\n", "Success\n", "Success\n", "59\n", "25 July 2018\n", "11:39\n", "F9 B5B1048\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "Success\n", "60\n", "7 August 2018\n", "05:18\n", "F9 B5B1046.2\n", "CCAFS\n", "Merah Putih\n", "Merah Putih\n", "GTO\n", "Telkom Indonesia\n", "Success\n", "Success\n", "61\n", "10 September 2018\n", "04:45\n", "F9 B5\n", "CCAFS\n", "Telstar 18V\n", "Telstar 18V\n", "GTO\n", "Telesat\n", "Success\n", "Success\n", "62\n", "8 October 2018\n", "02:22\n", "F9 B5B1048.2\n", "VAFB\n", "SAOCOM 1A\n", "SAOCOM 1A\n", "SSO\n", "CONAE\n", "Success\n", "Success\n", "63\n", "15 November 2018\n", "20:46\n", "F9 B5B1047.2\n", "KSC\n", "Es'hail 2\n", "Es'hail 2\n", "GTO\n", "Es'hailSat\n", "Success\n", "Success\n", "64\n", "3 December 2018\n", "18:34:05\n", "F9 B5B1046.3\n", "VAFB\n", "SSO-A\n", "SSO-A\n", "SSO\n", "Spaceflight Industries\n", "Success\n", "Success\n", "65\n", "5 December 2018\n", "18:16\n", "F9 B5\n", "CCAFS\n", "SpaceX CRS-16\n", "SpaceX CRS-16\n", "LEO\n", "NASA\n", "Success\n", "\n", "Failure\n", "66\n", "23 December 2018\n", "13:51\n", "F9 B5\n", "CCAFS\n", "GPS III\n", "GPS III\n", "MEO\n", "USAF\n", "Success\n", "Not attempted\n", "67\n", "11 January 2019\n", "15:31\n", "F9 B5B1049.2\n", "VAFB\n", "Iridium NEXT\n", "Iridium NEXT\n", "Polar\n", "Iridium Communications\n", "Success\n", "\n", "Success\n", "68\n", "22 February 2019\n", "01:45\n", "F9 B5B1048.3\n", "CCAFS\n", "Nusantara Satu\n", "Nusantara Satu\n", "GTO\n", "PSN\n", "Success\n", "\n", "Success\n", "69\n", "2 March 2019\n", "07:49\n", "F9 B5[268]\n", "KSC\n", "Crew Dragon Demo-1\n", "Crew Dragon Demo-1\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "70\n", "4 May 2019\n", "06:48\n", "F9 B5\n", "CCAFS\n", "SpaceX CRS-17\n", "SpaceX CRS-17\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "71\n", "24 May 2019\n", "02:30\n", "F9 B5B1049.3\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "72\n", "12 June 2019\n", "14:17\n", "F9 B5B1051.2\n", "VAFB\n", "RADARSAT Constellation\n", "RADARSAT Constellation\n", "SSO\n", "Canadian Space Agency\n", "Success\n", "\n", "Success\n", "73\n", "25 July 2019\n", "22:01\n", "F9 B5B1056.2\n", "CCAFS\n", "SpaceX CRS-18\n", "SpaceX CRS-18\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "74\n", "6 August 2019\n", "23:23\n", "F9 B5B1047.3\n", "CCAFS\n", "AMOS-17\n", "AMOS-17\n", "GTO\n", "Spacecom\n", "Success\n", "\n", "Not attempted\n", "75\n", "11 November 2019\n", "14:56\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "76\n", "5 December 2019\n", "17:29\n", "F9 B5\n", "CCAFS\n", "SpaceX CRS-19\n", "SpaceX CRS-19\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "77\n", "17 December 2019\n", "00:10\n", "F9 B5B1056.3\n", "CCAFS\n", "JCSat-18\n", "JCSat-18\n", "GTO\n", "Sky Perfect JSAT\n", "Success\n", "\n", "Success\n", "78\n", "7 January 2020\n", "02:19:21\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "79\n", "19 January 2020\n", "15:30\n", "F9 B5\n", "KSC\n", "Crew Dragon in-flight abort test\n", "Crew Dragon in-flight abort test\n", "Sub-orbital\n", "NASA\n", "Success\n", "\n", "Not attempted\n", "\n", "80\n", "29 January 2020\n", "14:07\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "81\n", "17 February 2020\n", "15:05\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Failure\n", "82\n", "7 March 2020\n", "04:50\n", "F9 B5\n", "CCAFS\n", "SpaceX CRS-20\n", "SpaceX CRS-20\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "83\n", "18 March 2020\n", "12:16\n", "F9 B5\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Failure\n", "84\n", "22 April 2020\n", "19:30\n", "F9 B5\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "85\n", "30 May 2020\n", "19:22\n", "F9 B5\n", "KSC\n", "Crew Dragon Demo-2\n", "Crew Dragon Demo-2\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "86\n", "4 June 2020\n", "01:25\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "87\n", "13 June 2020\n", "09:21\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "88\n", "30 June 2020\n", "20:10:46\n", "F9 B5\n", "CCAFS\n", "GPS III\n", "GPS III\n", "MEO\n", "U.S. Space Force\n", "Success\n", "\n", "Success\n", "89\n", "20 July 2020\n", "21:30\n", "F9 B5B1058.2\n", "CCAFS\n", "ANASIS-II\n", "ANASIS-II\n", "GTO\n", "Republic of Korea Army\n", "Success\n", "\n", "Success\n", "90\n", "7 August 2020\n", "05:12\n", "F9 B5\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "91\n", "18 August 2020\n", "14:31\n", "F9 B5B1049.6\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "92\n", "30 August 2020\n", "23:18\n", "F9 B5\n", "CCAFS\n", "SAOCOM 1B\n", "SAOCOM 1B\n", "SSO\n", "CONAE\n", "Success\n", "\n", "Success\n", "93\n", "3 September 2020\n", "12:46:14\n", "F9 B5B1060.2\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "94\n", "6 October 2020\n", "11:29:34\n", "F9 B5B1058.3\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "95\n", "18 October 2020\n", "12:25:57\n", "F9 B5B1051.6\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "96\n", "24 October 2020\n", "15:31:34\n", "F9 B5\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "97\n", "5 November 2020\n", "23:24:23\n", "F9 B5\n", "CCAFS\n", "GPS III\n", "GPS III\n", "MEO\n", "USSF\n", "Success\n", "\n", "Success\n", "98\n", "16 November 2020\n", "00:27\n", "F9 B5\n", "KSC\n", "Crew-1\n", "Crew-1\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "99\n", "21 November 2020\n", "17:17:08\n", "F9 B5\n", "VAFB\n", "Sentinel-6 Michael Freilich (Jason-CS A)\n", "Sentinel-6 Michael Freilich (Jason-CS A)\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "100\n", "25 November 2020\n", "02:13\n", "F9 B5 ♺\n", "CCAFS\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "101\n", "6 December 2020\n", "16:17:08\n", "F9 B5 ♺\n", "KSC\n", "SpaceX CRS-21\n", "SpaceX CRS-21\n", "LEO\n", "NASA\n", "Success\n", "\n", "Success\n", "102\n", "13 December 2020\n", "17:30:00\n", "F9 B5 ♺\n", "CCSFS\n", "SXM-7\n", "SXM-7\n", "GTO\n", "Sirius XM\n", "Success\n", "\n", "Success\n", "103\n", "19 December 2020\n", "14:00:00\n", "F9 B5 ♺\n", "KSC\n", "NROL-108\n", "NROL-108\n", "LEO\n", "NRO\n", "Success\n", "\n", "Success\n", "104\n", "8 January 2021\n", "02:15\n", "F9 B5\n", "CCSFS\n", "Türksat 5A\n", "Türksat 5A\n", "GTO\n", "Türksat\n", "Success\n", "\n", "Success\n", "105\n", "20 January 2021\n", "13:02\n", "F9 B5B1051.8\n", "KSC\n", "Starlink\n", "Starlink\n", "LEO\n", "SpaceX\n", "Success\n", "\n", "Success\n", "106\n", "24 January 2021\n", "15:00\n", "F9 B5B1058.5\n", "CCSFS\n", "Transporter-1\n", "Transporter-1\n", "SSO\n" ] }, { "ename": "AttributeError", "evalue": "'NoneType' object has no attribute 'string'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_1129/2140478783.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morbit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0mcustomer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0mdict2\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Customer'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcustomer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcustomer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'string'" ] } ], "source": [ "# Creating a dataframe by parsing the HTML tables\n", "\n", "dict2 = dict.fromkeys(column_names)\n", "\n", "del dict2['Date and time ( )']\n", "\n", "# Initializing the dictionary to have each value be an empty list\n", "\n", "dict2['Flight No.'] = []\n", "dict2['Launch site'] = []\n", "dict2['Payload'] = []\n", "dict2['Payload mass'] = []\n", "dict2['Orbit'] = []\n", "dict2['Customer'] = []\n", "dict2['Launch outcome'] = []\n", "dict2['Version Booster']=[]\n", "dict2['Booster landing']=[]\n", "dict2['Date']=[]\n", "dict2['Time']=[]\n", "\n", "# Parsing the HTML table to fill our dictionary \n", "\n", "extracted_row = 0\n", "\n", "for table_number, table in enumerate(soup.find_all('table',\"wikitable plainrowheaders collapsible\")):\n", " for rows in table.find_all(\"tr\"):\n", " if rows.th:\n", " if rows.th.string:\n", " flight_number = rows.th.string.strip()\n", " flag = flight_number.isdigit()\n", " else:\n", " flag=False\n", " row=rows.find_all(\"td\")\n", " if flag:\n", " extracted_row += 1\n", " \n", " dict2['Flight No.'].append(flight_number)\n", " print(flight_number)\n", " datatimelist=dt(row[0])\n", " \n", " date = datatimelist[0].strip(',')\n", " dict2['Date'].append(date)\n", " print(date)\n", " \n", " time = datatimelist[1]\n", " dict2['Time'].append(time)\n", " print(time)\n", " \n", " bsv = bv(row[1])\n", " dict2['Version Booster'].append(bsv)\n", " print(bsv)\n", " \n", " launch_site = row[2].a.string\n", " dict2['Launch site'].append(launch_site)\n", " print(launch_site)\n", " \n", " payload = row[3].a.string\n", " dict2['Payload'].append(payload)\n", " print(payload)\n", " \n", " payload_mass = g_ms(row[4])\n", " dict2['Payload mass'].append(payload_mass)\n", " print(payload)\n", " \n", " orbit = row[5].a.string\n", " dict2['Orbit'].append(orbit)\n", " print(orbit)\n", " \n", " customer = row[6].a.string\n", " dict2['Customer'].append(customer)\n", " print(customer)\n", " \n", " launch_outcome = list(row[7].strings)[0]\n", " dict2['Launch outcome'].append(launch_outcome)\n", " print(launch_outcome)\n", " \n", " booster_landing = lnds(row[8])\n", " dict2['Booster landing'].append(booster_landing)\n", " print(booster_landing)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df2= pd.DataFrame({ key:pd.Series(value) for key, value in dict2.items() })" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "# Exporting our dataframe to CSV\n", "\n", "df2.to_csv('dataset_p2_Webscraping.csv', index=False)" ] } ], "metadata": { "kernelspec": { "argv": [ "/usr/bin/python3", "-m", "ipykernel", "--HistoryManager.enabled=False", "--matplotlib=inline", "-c", "%config InlineBackend.figure_formats = set(['retina'])\nimport matplotlib; matplotlib.rcParams['figure.figsize'] = (12, 7)", "-f", "{connection_file}" ], "display_name": "Python 3 (system-wide)", "env": { }, "language": "python", "metadata": { "cocalc": { "description": "Python 3 programming language", "priority": 100, "url": "https://www.python.org/" } }, "name": "python3", "resource_dir": "/ext/jupyter/kernels/python3" } }, "nbformat": 4, "nbformat_minor": 4 }