# Function(s) to parse web scraped HTML table
#
# Every helper below takes a parsed HTML element and relies only on the
# attributes it reads from it (`.strings`, `.text`, `.contents`, `.br`, `.a`,
# `.sup`) -- presumably BeautifulSoup tags; confirm against the calling cells.

import unicodedata


def dt(table_cells):
    """Return the first two strings of a cell, whitespace-stripped.

    Judging by the name these are presumably the date and the time of a
    launch -- confirm against the caller.

    Args:
        table_cells: element exposing an iterable ``.strings`` of text pieces.

    Returns:
        A list with at most two stripped strings (shorter when the cell holds
        fewer than two strings).
    """
    return [piece.strip() for piece in list(table_cells.strings)[:2]]


def bv(table_cells):
    """Concatenate the even-indexed strings of a cell, dropping the last one.

    Only the strings at positions 0, 2, 4, ... are kept; the final kept string
    is then discarded and the remainder is joined without a separator.

    Args:
        table_cells: element exposing an iterable ``.strings`` of text pieces.

    Returns:
        The joined string ('' when fewer than two even-indexed strings exist).
    """
    even_pieces = list(table_cells.strings)[::2]
    return ''.join(even_pieces[:-1])


def lnds(table_cells):
    """Return the first string found in a cell.

    Args:
        table_cells: element exposing an iterable ``.strings`` of text pieces.

    Returns:
        The first string, unmodified.

    Raises:
        IndexError: if the cell contains no strings at all.
    """
    return list(table_cells.strings)[0]


def g_ms(table_cells):
    """Extract the leading "<number> kg" portion of a cell's text.

    The text is NFKD-normalized first so that compatibility characters such as
    non-breaking spaces become their plain equivalents, then stripped.

    Args:
        table_cells: element exposing its full text as ``.text``.

    Returns:
        The text from its start through the first "kg" (inclusive), or the
        integer 0 when the cell is empty or contains no "kg" marker.
    """
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    kg_at = mass.find("kg")
    if kg_at == -1:
        # Covers both the empty cell and text without a unit. Previously the
        # latter sliced with find()'s -1 and returned just the first character.
        return 0
    return mass[:kg_at + 2]  # +2 keeps the "kg" itself


def exColhd(row):
    """Build a column name from a table header element.

    The first ``br``, ``a`` and ``sup`` descendants (looked up in that order,
    each after the previous one was removed) are detached from ``row``, which
    is therefore mutated. The remaining ``row.contents`` are joined with single
    spaces.

    Args:
        row: header element exposing ``.br``, ``.a``, ``.sup`` and ``.contents``.

    Returns:
        The stripped column name, or None when the name consists only of
        digits (such headers are filtered out).

    Raises:
        TypeError: if ``row.contents`` still holds a non-string item after the
            removals, since ``str.join`` only accepts strings.
    """
    # Look each tag up only after the previous removal: detaching one element
    # can change which descendant the next attribute lookup resolves to.
    for tag_name in ("br", "a", "sup"):
        tag = getattr(row, tag_name)
        if tag:
            tag.extract()

    column_name = ' '.join(row.contents).strip()

    if column_name.isdigit():
        return None
    return column_name