CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

| Download

Needle man wunsch

Views: 22
Image: ubuntu2204
Kernel: Python 3 (system-wide)
import numpy as np #we import this library to create arrays. # Defining function for reading fasta file def read_fasta(filename): sequences = [] current_sequence = "" with open(filename, 'r') as file: for line in file: line = line.strip() if line.startswith(">"): if current_sequence: sequences.append(current_sequence) current_sequence = "" else: current_sequence += line if current_sequence: sequences.append(current_sequence) return sequences # This line should be indented correctly inside the function #the above function can be used for reading fasta file # Defining function for reading blosum matrix def read_blosum(filename): blosum = {} with open(filename, 'r') as file: amino_acids = None for line in file: line = line.strip() if line.startswith("#"): # Skip comments (lines starting with '#') continue if not amino_acids: # The first line after comments is expected to be the header line. amino_acids = line.split() else: values = line.split() amino_acid = values[0] scores = [int(val) for val in values[1:]] blosum[amino_acid] = {aa: score for aa, score in zip(amino_acids, scores)} return blosum def needleman_wunsch(seq1, seq2, blosum_matrix, gap_penalty): len_seq1 = len(seq1) len_seq2 = len(seq2) # Initialize the scoring matrix scoring_matrix = np.zeros((len_seq1 + 1, len_seq2 + 1), dtype=int) # Initialize the traceback matrix traceback_matrix = np.zeros((len_seq1 + 1, len_seq2 + 1), dtype=int) # Fill in the scoring matrix for i in range(1, len_seq1 + 1): for j in range(1, len_seq2 + 1): match = scoring_matrix[i-1][j-1] + blosum_matrix[seq1[i-1]][seq2[j-1]] delete = scoring_matrix[i-1][j] + gap_penalty insert = scoring_matrix[i][j-1] + gap_penalty scoring_matrix[i][j] = max(match, delete, insert) if scoring_matrix[i][j] == match: traceback_matrix[i][j] = 0 # Match this when two base are same elif scoring_matrix[i][j] == delete: traceback_matrix[i][j] = 1 # Delete else: traceback_matrix[i][j] = 2 # Insert # Traceback to find the aligned sequences align1 = [] align2 = [] i, j = len_seq1, len_seq2 while i > 0 or j > 0: if traceback_matrix[i][j] == 0: # Match align1.append(seq1[i-1]) align2.append(seq2[j-1]) i -= 1 j -= 1 elif traceback_matrix[i][j] == 1: # Delete align1.append(seq1[i-1]) align2.append('-') i -= 1 else: # Insert align1.append('-') align2.append(seq2[j-1]) j -= 1 aligned_seq1 = ''.join(align1[::-1]) aligned_seq2 = ''.join(align2[::-1]) return aligned_seq1, aligned_seq2, scoring_matrix[len_seq1][len_seq2] for i in range(len(sequences)): for j in range(i + 1, len(sequences)): seq1, seq2 = sequences[i], sequences[j] aligned_seq1, aligned_seq2, score = needleman_wunsch(seq1, seq2, blosum_matrix, gap_penalty) print(f"Alignment between Sequence {i + 1} and Sequence {j + 1} (Score: {score}):") print("Sequence 1:", aligned_seq1) print("Sequence 2:", aligned_seq2) print()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) /tmp/ipykernel_615/65219433.py in <cell line: 92>() 90 91 ---> 92 for i in range(len(sequences)): 93 for j in range(i + 1, len(sequences)): 94 seq1, seq2 = sequences[i], sequences[j] NameError: name 'sequences' is not defined