CoCalc -- Needleman algorithim

⁹ views
ubuntu2204
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'sequences' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipykernel_375/2648947585.py\u001b[0m in \u001b[0;36m<cell line: 93>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     92\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 93\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msequences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     94\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msequences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     95\u001b[0m         \u001b[0mseq1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'sequences' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "# Defining function for reading fasta file\n",
    "def read_fasta(filename):\n",
    "    sequences = []\n",
    "    current_sequence = \"\"\n",
    "    with open(filename, 'r') as file:\n",
    "        for line in file:\n",
    "            line = line.strip()\n",
    "            if line.startswith(\">\"):\n",
    "                if current_sequence:\n",
    "                    sequences.append(current_sequence)\n",
    "                current_sequence = \"\"\n",
    "            else:\n",
    "                current_sequence += line\n",
    "        if current_sequence:\n",
    "            sequences.append(current_sequence)\n",
    "    return sequences  # This line should be indented correctly inside the function\n",
    "\n",
    "\n",
    "\n",
    "#the above function can be used for reading fasta file\n",
    "# Defining function for reading blosum matrix \n",
    "def read_blosum(filename):\n",
    "    blosum = {}\n",
    "    with open(filename, 'r') as file:\n",
    "        amino_acids = None\n",
    "        for line in file:\n",
    "            line = line.strip()\n",
    "            if line.startswith(\"#\"):\n",
    "                # Skip comments (lines starting with '#')\n",
    "                continue\n",
    "            if not amino_acids:\n",
    "                # The first line after comments is expected to be the header line.\n",
    "                amino_acids = line.split()\n",
    "            else:\n",
    "                values = line.split()\n",
    "                amino_acid = values[0]\n",
    "                scores = [int(val) for val in values[1:]]\n",
    "                blosum[amino_acid] = {aa: score for aa, score in zip(amino_acids, scores)}\n",
    "    return blosum\n",
    "\n",
    "def needleman_wunsch(seq1, seq2, blosum_matrix, gap_penalty):\n",
    "    len_seq1 = len(seq1)\n",
    "    len_seq2 = len(seq2)\n",
    "    \n",
    "    # Initialize the scoring matrix\n",
    "    scoring_matrix = np.zeros((len_seq1 + 1, len_seq2 + 1), dtype=int)\n",
    "    \n",
    "    # Initialize the traceback matrix\n",
    "    traceback_matrix = np.zeros((len_seq1 + 1, len_seq2 + 1), dtype=int)\n",
    "\n",
    "    # Fill in the scoring matrix\n",
    "    for i in range(1, len_seq1 + 1):\n",
    "        for j in range(1, len_seq2 + 1):\n",
    "            match = scoring_matrix[i-1][j-1] + blosum_matrix[seq1[i-1]][seq2[j-1]]\n",
    "            delete = scoring_matrix[i-1][j] + gap_penalty\n",
    "            insert = scoring_matrix[i][j-1] + gap_penalty\n",
    "            scoring_matrix[i][j] = max(match, delete, insert)\n",
    "            \n",
    "            if scoring_matrix[i][j] == match:\n",
    "                traceback_matrix[i][j] = 0  # Match\n",
    "            elif scoring_matrix[i][j] == delete:\n",
    "                traceback_matrix[i][j] = 1  # Delete\n",
    "            else:\n",
    "                traceback_matrix[i][j] = 2  # Insert\n",
    "\n",
    "    # Traceback to find the aligned sequences\n",
    "    align1 = []\n",
    "    align2 = []\n",
    "    i, j = len_seq1, len_seq2\n",
    "    while i > 0 or j > 0:\n",
    "        if traceback_matrix[i][j] == 0:  # Match\n",
    "            align1.append(seq1[i-1])\n",
    "            align2.append(seq2[j-1])\n",
    "            i -= 1\n",
    "            j -= 1\n",
    "        elif traceback_matrix[i][j] == 1:  # Delete\n",
    "            align1.append(seq1[i-1])\n",
    "            align2.append('-')\n",
    "            i -= 1\n",
    "        else:  # Insert\n",
    "            align1.append('-')\n",
    "            align2.append(seq2[j-1])\n",
    "            j -= 1\n",
    "\n",
    "    aligned_seq1 = ''.join(align1[::-1])\n",
    "    aligned_seq2 = ''.join(align2[::-1])\n",
    "    \n",
    "    return aligned_seq1, aligned_seq2, scoring_matrix[len_seq1][len_seq2]\n",
    "\n",
    "\n",
    "for i in range(len(sequences)):\n",
    "    for j in range(i + 1, len(sequences)):\n",
    "        seq1, seq2 = sequences[i], sequences[j]\n",
    "        aligned_seq1, aligned_seq2, score = needleman_wunsch(seq1, seq2, blosum_matrix, gap_penalty)\n",
    "        print(f\"Alignment between Sequence {i + 1} and Sequence {j + 1} (Score: {score}):\")\n",
    "        print(\"Sequence 1:\", aligned_seq1)\n",
    "        print(\"Sequence 2:\", aligned_seq2)\n",
    "        print()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "argv": [
    "/usr/bin/python3",
    "-m",
    "ipykernel",
    "--HistoryManager.enabled=False",
    "--matplotlib=inline",
    "-c",
    "%config InlineBackend.figure_formats = set(['retina'])\nimport matplotlib; matplotlib.rcParams['figure.figsize'] = (12, 7)",
    "-f",
    "{connection_file}"
   ],
   "display_name": "Python 3 (system-wide)",
   "env": {
   },
   "language": "python",
   "metadata": {
    "cocalc": {
     "description": "Python 3 programming language",
     "priority": 100,
     "url": "https://www.python.org/"
    }
   },
   "name": "python3",
   "resource_dir": "/ext/jupyter/kernels/python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
Product

Resources

Company