CoCalc -- Needleman wunsch algorithim.ipynb

Needle man wunsch

²⁴ views
ubuntu2204

Kernel: Python 3 (system-wide)

In [2]:



import numpy as np #we import this library to create arrays.

# Defining function for reading fasta file
def read_fasta(filename):
    sequences = []
    current_sequence = ""
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if current_sequence:
                    sequences.append(current_sequence)
                current_sequence = ""
            else:
                current_sequence += line
        if current_sequence:
            sequences.append(current_sequence)
    return sequences  # This line should be indented correctly inside the function


#the above function can be used for reading fasta file
# Defining function for reading blosum matrix 
def read_blosum(filename):
    blosum = {}
    with open(filename, 'r') as file:
        amino_acids = None
        for line in file:
            line = line.strip()
            if line.startswith("#"):
                # Skip comments (lines starting with '#')
                continue
            if not amino_acids:
                # The first line after comments is expected to be the header line.
                amino_acids = line.split()
            else:
                values = line.split()
                amino_acid = values[0]
                scores = [int(val) for val in values[1:]]
                blosum[amino_acid] = {aa: score for aa, score in zip(amino_acids, scores)}
    return blosum

def needleman_wunsch(seq1, seq2, blosum_matrix, gap_penalty):
    len_seq1 = len(seq1)
    len_seq2 = len(seq2)
    
    # Initialize the scoring matrix
    scoring_matrix = np.zeros((len_seq1 + 1, len_seq2 + 1), dtype=int)
    
    # Initialize the traceback matrix
    traceback_matrix = np.zeros((len_seq1 + 1, len_seq2 + 1), dtype=int)

    # Fill in the scoring matrix
    for i in range(1, len_seq1 + 1):
        for j in range(1, len_seq2 + 1):
            match = scoring_matrix[i-1][j-1] + blosum_matrix[seq1[i-1]][seq2[j-1]]
            delete = scoring_matrix[i-1][j] + gap_penalty
            insert = scoring_matrix[i][j-1] + gap_penalty
            scoring_matrix[i][j] = max(match, delete, insert)
            
            if scoring_matrix[i][j] == match:
                traceback_matrix[i][j] = 0  # Match this when two base are same 
            elif scoring_matrix[i][j] == delete:
                traceback_matrix[i][j] = 1  # Delete
            else:
                traceback_matrix[i][j] = 2  # Insert

    # Traceback to find the aligned sequences
    align1 = []
    align2 = []
    i, j = len_seq1, len_seq2
    while i > 0 or j > 0:
        if traceback_matrix[i][j] == 0:  # Match
            align1.append(seq1[i-1])
            align2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif traceback_matrix[i][j] == 1:  # Delete
            align1.append(seq1[i-1])
            align2.append('-')
            i -= 1
        else:  # Insert
            align1.append('-')
            align2.append(seq2[j-1])
            j -= 1

    aligned_seq1 = ''.join(align1[::-1])
    aligned_seq2 = ''.join(align2[::-1])
    
    return aligned_seq1, aligned_seq2, scoring_matrix[len_seq1][len_seq2]


for i in range(len(sequences)):
    for j in range(i + 1, len(sequences)):
        seq1, seq2 = sequences[i], sequences[j]
        aligned_seq1, aligned_seq2, score = needleman_wunsch(seq1, seq2, blosum_matrix, gap_penalty)
        print(f"Alignment between Sequence {i + 1} and Sequence {j + 1} (Score: {score}):")
        print("Sequence 1:", aligned_seq1)
        print("Sequence 2:", aligned_seq2)
        print()

Out[2]:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/tmp/ipykernel_615/65219433.py in <cell line: 92>()
     90 
     91 
---> 92 for i in range(len(sequences)):
     93     for j in range(i + 1, len(sequences)):
     94         seq1, seq2 = sequences[i], sequences[j]
NameError: name 'sequences' is not defined

In [0]:

Product

Resources

Company