Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
431 views
ubuntu2004
1
def process_word(word):
    """Blank out punctuation and digits in *word* and lower-case it.

    Surrounding whitespace is stripped first. Every punctuation mark or
    digit that remains is then replaced by a single space (it is not
    deleted), so the result may contain interior or trailing spaces.
    """
    unwanted = '.!?,"\'()*_:;0123456789'
    # One translation table that maps each unwanted character to a space.
    to_space = str.maketrans(unwanted, " " * len(unwanted))
    return word.strip().translate(to_space).lower()
9
# Close on this one, but you needed to pass remove to the strip function, and it would have done the work for you. As written, it added extra spaces, which caused problems later on.
10
11
def process_word(word):
    """Return *word* normalised for counting.

    Punctuation marks and digits are trimmed from both ends of the word,
    any apostrophes still inside it are deleted, and the result is
    lower-cased. Characters in the middle of the word other than
    apostrophes are left untouched.
    """
    edge_chars = '.!?,"\'()*_:;0123456789'
    trimmed = word.strip(edge_chars)
    return trimmed.replace("'", '').lower()
16
17
def process_line(line):
    """Split *line* into a list of cleaned words.

    Hyphens are treated as word separators. Each whitespace-delimited
    token is passed through process_word, and tokens that come back
    empty are discarded.
    """
    # split() with no argument already ignores leading/trailing whitespace,
    # so no separate strip() of the line is needed.
    tokens = line.replace("-", " ").split()
    cleaned = map(process_word, tokens)
    # Keep only tokens that still have content after cleaning.
    return [token for token in cleaned if len(token) > 0]
29
30
def process_file(path):
    """Read the text file at *path* and return all of its cleaned words.

    Each line is handed to process_line and the resulting word lists are
    concatenated in file order.

    Args:
        path: Location of the text file, passed straight to open().

    Returns:
        A single list containing the words of every line, in order.
    """
    words = []
    with open(path) as file:
        # Iterate the file lazily and grow the list in place; rebuilding it
        # with `words = words + ...` copies every accumulated word per line.
        for line in file:
            words.extend(process_line(line))
    return words
38
39
def find_unique(words):
    """Return the distinct items of *words* in first-seen order.

    Args:
        words: Iterable of hashable items (strings in this file's pipeline).

    Returns:
        A new list holding each distinct item once, ordered by the
        position of its first occurrence.
    """
    seen = set()  # constant-time membership test instead of scanning a list
    unique_words = []
    for word in words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    return unique_words
45
46
def find_frequency(words):
    """Count how many times each word occurs in *words*.

    Returns a dict mapping each word to its number of occurrences; keys
    appear in the order the words were first encountered.
    """
    counts = {}
    for token in words:
        # get() supplies 0 the first time a token is seen.
        counts[token] = counts.get(token, 0) + 1
    return counts
54
55
def remove_stop(words, stop):
    """Return the items of *words* that are not contained in *stop*.

    Order and duplicates of the surviving words are preserved; neither
    argument is modified.
    """
    return [candidate for candidate in words if candidate not in stop]
62
63
def most_common(freq_dict, n):
    """Print a table of the *n* most frequent words.

    Rows are ordered by descending count; words with equal counts are
    ordered by descending word. If *n* is larger than the number of
    distinct words, every word is printed instead of raising IndexError.

    Args:
        freq_dict: Mapping of word -> occurrence count.
        n: Maximum number of rows to print; values <= 0 print no rows.

    Returns:
        None. The table is written to standard output.
    """
    # Build (count, word) pairs so that plain tuple ordering ranks by count
    # first and falls back to the word itself on ties.
    ranked = sorted(
        ((count, word) for word, count in freq_dict.items()),
        reverse=True,
    )
    print(f"{'Word':<12}{'Count':>4}")
    print("-" * 16)
    # A slice cannot over-run the list; max() keeps a negative n meaning
    # "no rows" rather than "all but the last few".
    for count, word in ranked[:max(n, 0)]:
        print(f"{word: <12}{count: >4}")
75
76
def count_by_length(words):
    """Print a table of how many words have each length.

    Rows are ordered from the longest length to the shortest. Nothing is
    returned; the table is written to standard output.
    """
    tally = {}
    for word in words:
        size = len(word)
        tally[size] = tally.get(size, 0) + 1

    # (length, count) pairs, longest length first.
    rows = sorted(tally.items(), reverse=True)

    print(f"{'Length':<12}{'Count':>4}")
    print("-" * 16)
    for size, total in rows:
        print(f"{size: <12}{total: >4}")
93
94
def count_by_first(words):
    """Print a table of how many words start with each character.

    Rows are ordered by descending first character. Empty words have no
    first character and are skipped rather than raising IndexError.

    Args:
        words: Iterable of words (strings).

    Returns:
        None. The table is written to standard output.
    """
    count_dict = {}
    for word in words:
        if not word:
            # Nothing to index into; ignore empty entries.
            continue
        initial = word[0]
        count_dict[initial] = count_dict.get(initial, 0) + 1

    # (character, count) pairs, highest character first.
    rows = sorted(count_dict.items(), reverse=True)

    print(f"{'Letter':<12}{'Count':>4}")
    print("-" * 16)
    for initial, total in rows:
        print(f"{initial: <12}{total: >4}")
111