"""Spell checker driven by a word-frequency list.

Workflow:

1. Read the text file to check.
2. Read the word-frequency data.
3. Split the frequency data into dictionaries keyed by word length.
4. Check every word of the text against the dictionary that holds words
   of the same length.
5. For each word missing from that dictionary, create a SpellingWarning
   and add it to the Report.
6. Compute the minimum edit distance between each reported word and every
   dictionary word of the same length.
7. Keep the words with the shortest edit distance and, among equally
   close words, prefer the ones with the highest frequency (3 in total).
"""

import heapq
import os
import sys
import time

# Number of suggestions reported for every unknown word.
MAX_SUGGESTIONS = 3


def _clean_token(token):
    """Return *token* without surrounding punctuation, or None.

    Leading and trailing characters that are not alphanumeric are removed.
    None is returned when what remains is not purely alphabetic (empty,
    contains digits, inner apostrophes/hyphens, ...), which means the token
    should be skipped.
    """
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    cleaned = token[start:end]
    return cleaned if cleaned.isalpha() else None


def _as_text(value):
    """Return the plain string for *value* (a str or a SpellingWarning)."""
    return getattr(value, "word", value)


def _as_frequency(value):
    """Convert a raw frequency field to a number; 0 when it is unparsable."""
    for convert in (int, float):
        try:
            return convert(value)
        except (TypeError, ValueError):
            continue
    return 0


class WordFrequencyData:
    """Reads the input files used by the spell checker."""

    def load_freq_data(self, filepath):
        """Load tab-separated ``word<TAB>frequency`` lines from *filepath*.

        Lines without a tab, and words that are not alphabetic after
        surrounding punctuation is stripped, are skipped.

        Returns:
            dict mapping word length -> {word: frequency}. Frequencies are
            numbers (0 when the field cannot be parsed).
        """
        freq_data = {}
        with open(filepath, encoding='utf-8') as file:
            for line in file:
                fields = line.split("\t")
                if len(fields) < 2:
                    continue  # malformed line: no frequency column
                word = _clean_token(fields[0])
                if word is None:
                    continue
                bucket = freq_data.setdefault(len(word), {})
                bucket[word] = _as_frequency(fields[1])
        return freq_data

    def read_text_lines(self, text):
        """Return the lines of the file at path *text* as a list of strings."""
        # Same encoding as the frequency data so both sides compare equal.
        with open(text, encoding='utf-8') as text_to_read:
            return text_to_read.readlines()


class SpellingWarning:
    """An unknown word together with the line on which it was found."""

    def __init__(self, word, line_index, freq_data):
        self.line_index = line_index
        self.word = word
        self.freq_data = freq_data
        self.length = len(word)

    def minimum_edit_distance(self, word, existing_word):
        """Return the minimum edit distance between the two words.

        Insertions, deletions and substitutions each cost 1. Both arguments
        may be plain strings or SpellingWarning instances.
        """
        shorter = _as_text(word)
        longer = _as_text(existing_word)
        # The distance is symmetric; iterate rows over the longer string so
        # each row stays as short as possible.
        if len(shorter) > len(longer):
            shorter, longer = longer, shorter
        previous = list(range(len(shorter) + 1))
        for row, long_char in enumerate(longer, start=1):
            current = [row]
            for col, short_char in enumerate(shorter):
                if short_char == long_char:
                    current.append(previous[col])
                else:
                    current.append(1 + min(previous[col],
                                           previous[col + 1],
                                           current[-1]))
            previous = current
        return previous[-1]

    def get_word_suggestion(self, word):
        """Return up to MAX_SUGGESTIONS replacement candidates for *word*.

        Candidates are the dictionary words of the same length as *word*,
        ranked by edit distance and then by descending frequency. Words
        that tie on both keep their dictionary order.

        Args:
            word: the misspelled word, as a str or a SpellingWarning.

        Returns:
            list of suggested words, best first; empty when the dictionary
            has no words of that length.
        """
        target = _as_text(word)
        candidates = self.freq_data.get(len(target), {})

        def rank(item):
            candidate, frequency = item
            return (self.minimum_edit_distance(target, candidate),
                    -_as_frequency(frequency))

        best = heapq.nsmallest(MAX_SUGGESTIONS, candidates.items(), key=rank)
        return [candidate for candidate, _ in best]


class Report:
    """Collects spelling warnings and their suggestions for one text."""

    def __init__(self, name):
        self.spell_warnings = {}
        self.name = name + "_report"

    def add_SpellingWarning(self, word):
        """Store the SpellingWarning *word* together with its suggestions."""
        self.spell_warnings[word] = word.get_word_suggestion(word)

    def __str__(self):
        """Return one ``[line N] word: a, b, c`` row per stored warning."""
        rows = [
            f"[line {warning.line_index}] {warning.word}: "
            f"{', '.join(suggestions)} \n"
            for warning, suggestions in self.spell_warnings.items()
        ]
        return "".join(rows)


class WordControl:
    """Spell-checks one text file against word-frequency data."""

    def __init__(self, freq_data, text):
        """Prepare a check of the file *text*.

        Args:
            freq_data: path to the frequency file, or an already loaded
                dictionary as returned by WordFrequencyData.load_freq_data
                (lets callers parse the frequency file only once).
            text: path to the text file to check.
        """
        self.data = WordFrequencyData()
        self.lines_in_text = self.data.read_text_lines(text)
        if isinstance(freq_data, dict):
            self.freq_data = freq_data
        else:
            self.freq_data = self.data.load_freq_data(freq_data)
        self.text = text
        # Drop the extension whatever its length ("essay.txt" -> "essay").
        self.report_name = os.path.splitext(text)[0]
        self.report = Report(self.report_name)

    def word_spell_check(self):
        """Find the unknown words of the text and add them to the report."""
        print(f"* Reading '{self.text}' and looking for unknown words...")
        wrong_spelled_words = []
        for line_index, line in enumerate(self.lines_in_text, start=1):
            for token in line.split():
                # Commas, periods, quotes etc. stick to the token; strip them.
                word = _clean_token(token.lower())
                if word is None:
                    continue
                # A length with no dictionary bucket means the word is unknown.
                if word not in self.freq_data.get(len(word), {}):
                    wrong_spelled_words.append(
                        SpellingWarning(word, line_index, self.freq_data))
        self.create_report(wrong_spelled_words)

    def create_report(self, wrong_spelled_word_list: list):
        """Add every SpellingWarning of the list to the report."""
        for word in wrong_spelled_word_list:
            self.report.add_SpellingWarning(word)

    def __str__(self):
        return str(self.report)


def get_total_words_freq_data(filepath):
    """Return the number of lines (one word per line) in the frequency file."""
    with open(filepath, encoding='utf-8') as file:
        return sum(1 for _ in file)


def main(argv=None):
    """Spell-check every text file named on the command line.

    Args:
        argv: argument vector ``[program, frequency_file, text_file, ...]``;
            defaults to sys.argv.

    Each report is written next to its text file as ``<name>_report.txt``.

    Raises:
        SystemExit: when the frequency file argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        program = argv[0] if argv else "lab7.py"
        raise SystemExit(f"usage: {program} FREQUENCY_FILE [TEXT_FILE ...]")
    freq_path, text_paths = argv[1], argv[2:]

    print(f"* Loading word frequency data from '{freq_path}'...")
    # Parse the frequency file once and share it between all the texts.
    freq_data = WordFrequencyData().load_freq_data(freq_path)
    print(f"* Frequency data for {get_total_words_freq_data(freq_path)} words loaded.")
    print(f"* {len(text_paths)} file(s) to check")
    for text_path in text_paths:
        time_start = time.perf_counter()
        word_control = WordControl(freq_data, text_path)
        word_control.word_spell_check()
        print(f"* Found {len(word_control.report.spell_warnings)} unknown words.")
        total_time = round(time.perf_counter() - time_start, 2)
        report_path = f"{word_control.report.name}.txt"
        print(f"* Saving report to {report_path}")
        with open(report_path, "w", encoding='utf-8') as text_file:
            text_file.write(f"Spell check for '{text_path}' took {total_time} seconds.\n\n")
            text_file.write(str(word_control))


if __name__ == "__main__":
    main()