{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## Data Wrangling"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_colwidth', None)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Data Analysis\n",
"Loading in our SpaceX dataset"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" FlightNumber | \n",
" Date | \n",
" BoosterVersion | \n",
" PayloadMass | \n",
" Orbit | \n",
" LaunchSite | \n",
" Outcome | \n",
" Flights | \n",
" GridFins | \n",
" Reused | \n",
" Legs | \n",
" LandingPad | \n",
" Block | \n",
" ReusedCount | \n",
" Serial | \n",
" Longitude | \n",
" Latitude | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2010-06-04 | \n",
" Falcon 9 | \n",
" 6104.959412 | \n",
" LEO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B0003 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2012-05-22 | \n",
" Falcon 9 | \n",
" 525.000000 | \n",
" LEO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B0005 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 2013-03-01 | \n",
" Falcon 9 | \n",
" 677.000000 | \n",
" ISS | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B0007 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 2013-09-29 | \n",
" Falcon 9 | \n",
" 500.000000 | \n",
" PO | \n",
" VAFB SLC 4E | \n",
" False Ocean | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1003 | \n",
" -120.610829 | \n",
" 34.632093 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 2013-12-03 | \n",
" Falcon 9 | \n",
" 3170.000000 | \n",
" GTO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1004 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" 2014-01-06 | \n",
" Falcon 9 | \n",
" 3325.000000 | \n",
" GTO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1005 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 6 | \n",
" 7 | \n",
" 2014-04-18 | \n",
" Falcon 9 | \n",
" 2296.000000 | \n",
" ISS | \n",
" CCAFS SLC 40 | \n",
" True Ocean | \n",
" 1 | \n",
" False | \n",
" False | \n",
" True | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1006 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 7 | \n",
" 8 | \n",
" 2014-07-14 | \n",
" Falcon 9 | \n",
" 1316.000000 | \n",
" LEO | \n",
" CCAFS SLC 40 | \n",
" True Ocean | \n",
" 1 | \n",
" False | \n",
" False | \n",
" True | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1007 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 8 | \n",
" 9 | \n",
" 2014-08-05 | \n",
" Falcon 9 | \n",
" 4535.000000 | \n",
" GTO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1008 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
" 9 | \n",
" 10 | \n",
" 2014-09-07 | \n",
" Falcon 9 | \n",
" 4428.000000 | \n",
" GTO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1011 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" FlightNumber Date BoosterVersion PayloadMass Orbit LaunchSite \\\n",
"0 1 2010-06-04 Falcon 9 6104.959412 LEO CCAFS SLC 40 \n",
"1 2 2012-05-22 Falcon 9 525.000000 LEO CCAFS SLC 40 \n",
"2 3 2013-03-01 Falcon 9 677.000000 ISS CCAFS SLC 40 \n",
"3 4 2013-09-29 Falcon 9 500.000000 PO VAFB SLC 4E \n",
"4 5 2013-12-03 Falcon 9 3170.000000 GTO CCAFS SLC 40 \n",
"5 6 2014-01-06 Falcon 9 3325.000000 GTO CCAFS SLC 40 \n",
"6 7 2014-04-18 Falcon 9 2296.000000 ISS CCAFS SLC 40 \n",
"7 8 2014-07-14 Falcon 9 1316.000000 LEO CCAFS SLC 40 \n",
"8 9 2014-08-05 Falcon 9 4535.000000 GTO CCAFS SLC 40 \n",
"9 10 2014-09-07 Falcon 9 4428.000000 GTO CCAFS SLC 40 \n",
"\n",
" Outcome Flights GridFins Reused Legs LandingPad Block \\\n",
"0 None None 1 False False False NaN 1.0 \n",
"1 None None 1 False False False NaN 1.0 \n",
"2 None None 1 False False False NaN 1.0 \n",
"3 False Ocean 1 False False False NaN 1.0 \n",
"4 None None 1 False False False NaN 1.0 \n",
"5 None None 1 False False False NaN 1.0 \n",
"6 True Ocean 1 False False True NaN 1.0 \n",
"7 True Ocean 1 False False True NaN 1.0 \n",
"8 None None 1 False False False NaN 1.0 \n",
"9 None None 1 False False False NaN 1.0 \n",
"\n",
" ReusedCount Serial Longitude Latitude \n",
"0 0 B0003 -80.577366 28.561857 \n",
"1 0 B0005 -80.577366 28.561857 \n",
"2 0 B0007 -80.577366 28.561857 \n",
"3 0 B1003 -120.610829 34.632093 \n",
"4 0 B1004 -80.577366 28.561857 \n",
"5 0 B1005 -80.577366 28.561857 \n",
"6 0 B1006 -80.577366 28.561857 \n",
"7 0 B1007 -80.577366 28.561857 \n",
"8 0 B1008 -80.577366 28.561857 \n",
"9 0 B1011 -80.577366 28.561857 "
]
},
"execution_count": 27,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Remote CSV holding the SpaceX launch records\n",
"DATASET_URL = \"https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv\"\n",
"\n",
"df = pd.read_csv(DATASET_URL)\n",
"\n",
"# Preview the first ten rows\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"First, I calculated the percentage of missing values in each attribute."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"FlightNumber 0.000000\n",
"Date 0.000000\n",
"BoosterVersion 0.000000\n",
"PayloadMass 0.000000\n",
"Orbit 0.000000\n",
"LaunchSite 0.000000\n",
"Outcome 0.000000\n",
"Flights 0.000000\n",
"GridFins 0.000000\n",
"Reused 0.000000\n",
"Legs 0.000000\n",
"LandingPad 28.888889\n",
"Block 0.000000\n",
"ReusedCount 0.000000\n",
"Serial 0.000000\n",
"Longitude 0.000000\n",
"Latitude 0.000000\n",
"dtype: float64"
]
},
"execution_count": 28,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Percentage of missing values in each column\n",
"(\n",
"    df.isna()\n",
"    .sum()\n",
"    .div(len(df))\n",
"    .mul(100)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"FlightNumber int64\n",
"Date object\n",
"BoosterVersion object\n",
"PayloadMass float64\n",
"Orbit object\n",
"LaunchSite object\n",
"Outcome object\n",
"Flights int64\n",
"GridFins bool\n",
"Reused bool\n",
"Legs bool\n",
"LandingPad object\n",
"Block float64\n",
"ReusedCount int64\n",
"Serial object\n",
"Longitude float64\n",
"Latitude float64\n",
"dtype: object"
]
},
"execution_count": 29,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Inspect each column's dtype to tell numerical and categorical columns apart\n",
"\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"The dataset contains launches from different launch facilities, so first I counted the number of launches from each site."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LaunchSite\n",
"CCAFS SLC 40 55\n",
"KSC LC 39A 22\n",
"VAFB SLC 4E 13\n",
"Name: count, dtype: int64"
]
},
"execution_count": 30,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Number of launches recorded for each launch site\n",
"LaunchSiteCount = df.loc[:, \"LaunchSite\"].value_counts()\n",
"LaunchSiteCount"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Every launch targets a dedicated orbit, so next I counted the number of occurrences of each orbit type."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Orbit\n",
"GTO 27\n",
"ISS 21\n",
"VLEO 14\n",
"PO 9\n",
"LEO 7\n",
"SSO 5\n",
"MEO 3\n",
"ES-L1 1\n",
"HEO 1\n",
"SO 1\n",
"GEO 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 31,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Number of launches sent to each orbit type\n",
"OrbitCount = df.loc[:, \"Orbit\"].value_counts()\n",
"OrbitCount"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Next, I looked at how many different landing outcomes there were, and how frequently each occurred."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Outcome\n",
"True ASDS 41\n",
"None None 19\n",
"True RTLS 14\n",
"False ASDS 6\n",
"True Ocean 5\n",
"False Ocean 2\n",
"None ASDS 2\n",
"False RTLS 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 32,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Frequency of each distinct landing outcome label\n",
"landing_outcomes = df.loc[:, \"Outcome\"].value_counts()\n",
"landing_outcomes"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Taking a closer look at the landing outcomes, I needed to create a landing outcome label. \n",
"\n",
"The first step was identifying the keys for each respective outcome. \n",
"\n",
"\n",
"Then identifying all the outcomes where a landing wasn't achieved.\n",
"\n",
"Lastly, I assigned the landing outcome to be represented by the following; \n",
"\n",
"0=Failed Landing\n",
"\n",
"1=Successful Landing\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 True ASDS\n",
"1 None None\n",
"2 True RTLS\n",
"3 False ASDS\n",
"4 True Ocean\n",
"5 False Ocean\n",
"6 None ASDS\n",
"7 False RTLS\n"
]
}
],
"source": [
"# List every distinct landing outcome next to its position in landing_outcomes\n",
"for position, label in enumerate(landing_outcomes.index):\n",
"    print(f\"{position} {label}\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'False ASDS', 'False Ocean', 'False RTLS', 'None ASDS', 'None None'}"
]
},
"execution_count": 34,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Identifying the outcomes that resulted in landing failure.\n",
"# Every label that does not start with \"True\" (e.g. \"False ASDS\", \"None None\")\n",
"# is a failed landing. Selecting by label rather than by position in\n",
"# landing_outcomes keeps the set correct if the frequency ordering changes.\n",
"bad_outcomes = {\n",
"    outcome\n",
"    for outcome in landing_outcomes.index\n",
"    if not str(outcome).startswith(\"True\")\n",
"}\n",
"bad_outcomes"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Assigning our landing outcome labels: 0 = failed landing, 1 = successful landing.\n",
"# isin() checks the whole column at once instead of calling a Python lambda per row.\n",
"landing_class = (~df['Outcome'].isin(bad_outcomes)).astype('int64')\n",
"\n",
"# Applying our new label to our dataframe\n",
"df['Class'] = landing_class"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" FlightNumber | \n",
" Date | \n",
" BoosterVersion | \n",
" PayloadMass | \n",
" Orbit | \n",
" LaunchSite | \n",
" Outcome | \n",
" Flights | \n",
" GridFins | \n",
" Reused | \n",
" Legs | \n",
" LandingPad | \n",
" Block | \n",
" ReusedCount | \n",
" Serial | \n",
" Longitude | \n",
" Latitude | \n",
" Class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 2010-06-04 | \n",
" Falcon 9 | \n",
" 6104.959412 | \n",
" LEO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B0003 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 2012-05-22 | \n",
" Falcon 9 | \n",
" 525.000000 | \n",
" LEO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B0005 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 2013-03-01 | \n",
" Falcon 9 | \n",
" 677.000000 | \n",
" ISS | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B0007 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 2013-09-29 | \n",
" Falcon 9 | \n",
" 500.000000 | \n",
" PO | \n",
" VAFB SLC 4E | \n",
" False Ocean | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1003 | \n",
" -120.610829 | \n",
" 34.632093 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 2013-12-03 | \n",
" Falcon 9 | \n",
" 3170.000000 | \n",
" GTO | \n",
" CCAFS SLC 40 | \n",
" None None | \n",
" 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
" 0 | \n",
" B1004 | \n",
" -80.577366 | \n",
" 28.561857 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" FlightNumber Date BoosterVersion PayloadMass Orbit LaunchSite \\\n",
"0 1 2010-06-04 Falcon 9 6104.959412 LEO CCAFS SLC 40 \n",
"1 2 2012-05-22 Falcon 9 525.000000 LEO CCAFS SLC 40 \n",
"2 3 2013-03-01 Falcon 9 677.000000 ISS CCAFS SLC 40 \n",
"3 4 2013-09-29 Falcon 9 500.000000 PO VAFB SLC 4E \n",
"4 5 2013-12-03 Falcon 9 3170.000000 GTO CCAFS SLC 40 \n",
"\n",
" Outcome Flights GridFins Reused Legs LandingPad Block \\\n",
"0 None None 1 False False False NaN 1.0 \n",
"1 None None 1 False False False NaN 1.0 \n",
"2 None None 1 False False False NaN 1.0 \n",
"3 False Ocean 1 False False False NaN 1.0 \n",
"4 None None 1 False False False NaN 1.0 \n",
"\n",
" ReusedCount Serial Longitude Latitude Class \n",
"0 0 B0003 -80.577366 28.561857 0 \n",
"1 0 B0005 -80.577366 28.561857 0 \n",
"2 0 B0007 -80.577366 28.561857 0 \n",
"3 0 B1003 -120.610829 34.632093 0 \n",
"4 0 B1004 -80.577366 28.561857 0 "
]
},
"execution_count": 36,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Check that the Class column now appears in the dataframe\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Because the outcomes are encoded as 0 = failure and 1 = success, the mean of the `Class` column gives the overall landing success rate."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.6666666666666666"
]
},
"execution_count": 37,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Class is 1 for a successful landing and 0 otherwise, so its mean is the success rate\n",
"df.loc[:, \"Class\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"# Lastly, write the labelled data to a local CSV file, leaving out the row index\n",
"OUTPUT_CSV = \"dataset_part_2.csv\"\n",
"df.to_csv(OUTPUT_CSV, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"argv": [
"/usr/bin/python3",
"-m",
"ipykernel",
"--HistoryManager.enabled=False",
"--matplotlib=inline",
"-c",
"%config InlineBackend.figure_formats = set(['retina'])\nimport matplotlib; matplotlib.rcParams['figure.figsize'] = (12, 7)",
"-f",
"{connection_file}"
],
"display_name": "Python 3 (system-wide)",
"env": {
},
"language": "python",
"metadata": {
"cocalc": {
"description": "Python 3 programming language",
"priority": 100,
"url": "https://www.python.org/"
}
},
"name": "python3",
"resource_dir": "/ext/jupyter/kernels/python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}