Module dictionary
This module allows you to manage input and output files and process the dictionary.
Expand source code
"""
## This module allows you to **manage** input and output files and **process** the dictionary.
"""
from statistics import quantiles
from fractions import Fraction
from sys import maxsize
####################################
# Input and output file management #
####################################
def open_alphabet(filename):
    """
    `open_alphabet()` gets the input alphabet from a file.

    The file content is split on any run of whitespace, so line breaks never
    stay attached to a letter and repeated spaces do not create empty entries.

    * **filename** (*str*): the name of the file to open (`read` mode)
    * **return** (*list*): the input alphabet, with the empty string `''` inserted at index 0
    """
    # The context manager closes the file even when reading raises.
    with open(filename, "r") as alphabet_file:
        alphabet = alphabet_file.read().split()
    # The empty string is always the first entry of the alphabet.
    alphabet.insert(0, '')
    return alphabet
def open_dictionaries(filenames):
    """
    `open_dictionaries()` gets the input dictionary from one or several files.

    Each file is split on line breaks. Lines starting with `#`, empty lines and
    lines made of a single space are skipped. The words of every file are kept,
    in the order of **filenames**.

    * **filenames** (*list*): the list of names of files to open (`read` mode)
    * **return** (*list*): the input dictionary (empty when **filenames** is empty)
    """
    dictionary = []
    for filename in filenames:
        # The context manager closes the file even when reading raises.
        with open(filename, "r") as dictionary_file:
            lines = dictionary_file.read().split('\n')
        # Accumulate instead of overwriting so that earlier files are not lost.
        dictionary.extend(
            line for line in lines
            if not (line.startswith('#') or line == '' or line == ' ')
        )
    return dictionary
def get_alphabet_from_dict(dictionary):
    """
    `get_alphabet_from_dict()` builds the alphabet out of every letter used in the dictionary.

    * **dictionary** (*list*): the input dictionary (before processing)
    * **return** (*list*): the sorted letters used in the dictionary, preceded by the empty string `''`
    """
    # A set keeps one copy of each letter, whatever the number of uses.
    used_letters = {symbol for entry in dictionary for symbol in entry}
    return [''] + sorted(used_letters)
def write_clean_dictionary(dictionary, filename):
    """
    `write_clean_dictionary()` writes the processed dictionary in a file, one word per line.

    * **dictionary** (*list*): the input dictionary (after processing)
    * **filename** (*str*): the name of the file to open (`write` mode, existing content is overwritten)
    * **return** (*None*)
    """
    # The context manager flushes and closes the file even when a write raises.
    with open(filename, 'w') as output_file:
        output_file.writelines(word + '\n' for word in dictionary)
def write_generated_words(word_list, filename):
    """
    `write_generated_words()` writes the generated words in a file.

    * **word_list** (*str*): the string that contains all generated words (written as is)
    * **filename** (*str*): the name of the file to open (`write` mode, existing content is overwritten)
    * **return** (*None*)
    """
    # The context manager flushes and closes the file even when the write raises.
    with open(filename, 'w') as output_file:
        output_file.write(word_list)
#########################
# Dictionary processing #
#########################
def process_dictionary(dictionary):
    """
    `process_dictionary()` drops duplicated words and sorts what is left.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*list*): a new sorted list holding each word once
    """
    unique_words = list(set(dictionary))
    unique_words.sort()
    return unique_words
def get_length_range(dictionary, percent):
    """
    `get_length_range()` gets the minimum and maximum size values for which a percentage of words in dictionary lie between them.

    The interval is centred: the same share of words is left out on each side.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **percent** (*int*): the percentage of words within the interval (0 to 100)
    * **return** (*tuple*): the minimum and maximum size values
    * **raise** (*ValueError*): when **percent** is outside the 0 to 100 range
    """
    if not 0 <= percent <= 100:
        raise ValueError(f"percent must be between 0 and 100, got {percent}")
    # p: percentage (0<=p<=1), approximated by the fraction a/b
    # n: number of quantiles
    # o: offset on each side
    dict_len = [len(word) for word in dictionary]
    p = percent / 100
    (a, b) = Fraction(p).limit_denominator(1000).as_integer_ratio()
    if a == b:
        # The whole dictionary is requested: the offset would be 0, which does
        # not map to any cut point, so return the extreme lengths directly.
        return (float(min(dict_len)), float(max(dict_len)))
    # Each side leaves out (1-p)/2 = (b-a)/(2b) of the words. When b-a is even
    # the fraction is reduced to ((b-a)/2)/b to keep the quantile count small.
    (o, n) = (b - a, 2 * b) if (b - a) % 2 == 1 else ((b - a) // 2, b)
    # Cut points are computed once and indexed from both ends.
    cut_points = quantiles(dict_len, n=n)
    return (cut_points[o - 1], cut_points[-o])
def process_size(size):
    """
    `process_size()` converts the size argument to a min_len and max_len couple (tuple length two).

    Accepted forms are `':'` (no limit), `'N'` (exactly N), `':N'` (at most N),
    `'N:'` (at least N) and `'N:M'` (from N to M). Every bound is clamped to the
    `0` to `sys.maxsize` range and the resulting couple is always validated.

    * **size** (*str*): the input argument value
    * **return** (*couple*): the minimum and maximum size values
    * **raise** (*ValueError*): when a bound is not an integer, when more than one `:` is given or when the minimum is greater than the maximum
    """
    if ':' in size:
        lower, _, upper = size.partition(':')
        if ':' in upper:
            # Reject 'a:b:c' instead of silently ignoring the middle value.
            raise ValueError(f"size value error: {size!r} contains more than one ':'")
        # A missing bound means "no limit" on that side.
        min_len = max(int(lower), 0) if lower else 0
        max_len = min(int(upper), maxsize) if upper else maxsize
    else:
        exact = int(size)
        min_len = max(exact, 0)
        max_len = min(exact, maxsize)
    if max_len < min_len:
        # ValueError is a subclass of Exception, so callers catching Exception still work.
        raise ValueError(f"size value error: {min_len} is greater than {max_len}")
    return (min_len, max_len)
def remove_missing_letters(dictionary, missing_letters):
    """
    `remove_missing_letters()` removes from the dictionary every word that uses at least one letter that is not in the alphabet.

    The list is modified in place and every occurrence of an offending word is
    removed, including duplicates. The order of the remaining words is kept.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **missing_letters** (*list*): letters used in the dictionary that are not in the alphabet
    * **return** (*list*): the same list object, without any word that contains a letter from **missing_letters**
    """
    # Materialise once so that any iterable can be scanned for every word.
    letters = tuple(missing_letters)
    # Slice assignment keeps the caller's list object while filtering in one pass.
    dictionary[:] = [
        word for word in dictionary
        if not any(letter in word for letter in letters)
    ]
    return dictionary
def get_missing_letters(dictionary, alphabet):
    """
    `get_missing_letters()` lists the letters used in the dictionary that are absent from the alphabet.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **alphabet** (*list*): the used alphabet (from input file or from dictionary)
    * **return** (*list*): each letter used at least once in the dictionary and not found in the alphabet, in order of first appearance
    """
    unknown_letters = (
        symbol
        for entry in dictionary
        for symbol in entry
        if symbol not in alphabet
    )
    # dict keys keep insertion order and collapse repeated letters.
    return list(dict.fromkeys(unknown_letters))
def print_plural_words(dictionary, lang):
    """
    `print_plural_words()` prints every word from the dictionary that is already in the dictionary in singular form.

    French rules: a word ending in `s` or `x` whose stem is in the dictionary, or
    a word longer than three letters ending in `aux` whose `al` or `ail` form is
    in the dictionary. Empty words never match.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **lang** (*str*): the language used to follow plural rules (only `'fr'` is handled, any other value prints nothing)
    * **return** (*None*)
    """
    if lang == 'fr':
        # Built once so that each singular lookup is a constant-time test.
        known_words = set(dictionary)
        for word in dictionary:
            # endswith() is False on an empty word, unlike indexing its last letter.
            if word.endswith(('s', 'x')) and word[:-1] in known_words \
            or len(word) > 3 and word.endswith('aux') \
            and (word[:-3] + 'al' in known_words or word[:-3] + 'ail' in known_words):
                print(word)
    # insert here plural rules from other languages
def remove_plural_words(dictionary, lang):
    """
    `remove_plural_words()` removes from the dictionary every word that is already in the dictionary in singular form.

    French rules: a word ending in `s` or `x` whose stem is in the dictionary, or
    a word longer than three letters ending in `aux` whose `al` or `ail` form is
    in the dictionary. The list is modified in place, every occurrence of a
    plural word is removed and empty words are left untouched.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **lang** (*str*): the language used to follow plural rules (only `'fr'` is handled, any other value leaves the dictionary unchanged)
    * **return** (*list*): the same list object, without duplicated words in singular / plural forms
    """
    if lang == 'fr':
        # Snapshot taken before filtering: plurals are judged against the full input.
        known_words = set(dictionary)

        def is_plural(word):
            # endswith() is False on an empty word, unlike indexing its last letter.
            return (
                word.endswith(('s', 'x')) and word[:-1] in known_words
                or len(word) > 3 and word.endswith('aux')
                and (word[:-3] + 'al' in known_words or word[:-3] + 'ail' in known_words)
            )

        # Slice assignment keeps the caller's list object while filtering in one pass.
        dictionary[:] = [word for word in dictionary if not is_plural(word)]
    # HERE insert plural rules from other languages
    return dictionary
def lower_case_words(dictionary):
    """
    `lower_case_words()` returns the dictionary with every word converted to lower case.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*list*): a new list holding each word lower-cased, in the same order
    """
    lowered = []
    for entry in dictionary:
        lowered.append(entry.lower())
    return lowered
def print_acronyms(dictionary):
    """
    `print_acronyms()` prints every acronym from the dictionary, one per line.
    An acronym is a word such that `word == word.upper()`.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*None*)
    """
    upper_cased_only = (entry for entry in dictionary if entry.upper() == entry)
    for acronym in upper_cased_only:
        print(acronym)
def remove_acronyms(dictionary):
    """
    `remove_acronyms()` removes every acronym from the dictionary.
    An acronym is a word such that `word == word.upper()`.

    The list is modified in place and every occurrence of an acronym is removed,
    including duplicates. The order of the remaining words is kept.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*list*): the same list object, without acronyms
    """
    # Slice assignment keeps the caller's list object while filtering in one pass.
    dictionary[:] = [word for word in dictionary if word != word.upper()]
    return dictionary
Functions
def get_alphabet_from_dict(dictionary)
-
get_alphabet_from_dict()
gets the alphabet from the dictionary by adding each used letter.- dictionary (list): the input dictionary (before processing)
- return (list): the alphabet based on the letters used in the dictionary
Expand source code
def get_alphabet_from_dict(dictionary):
    """
    `get_alphabet_from_dict()` builds the alphabet out of every letter used in the dictionary.

    * **dictionary** (*list*): the input dictionary (before processing)
    * **return** (*list*): the sorted letters used in the dictionary, preceded by the empty string `''`
    """
    # A set keeps one copy of each letter, whatever the number of uses.
    used_letters = {symbol for entry in dictionary for symbol in entry}
    return [''] + sorted(used_letters)
def get_length_range(dictionary, percent)
-
get_length_range()
gets the minimum and maximum size values for which a percentage of words in dictionary lie between them.- dictionary (list): the input dictionary (while processing)
- percent (int): the percentage of words within the interval
- return (list): the minimum and maximum size values
Expand source code
def get_length_range(dictionary, percent):
    """
    `get_length_range()` gets the minimum and maximum size values for which a percentage of words in dictionary lie between them.

    The interval is centred: the same share of words is left out on each side.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **percent** (*int*): the percentage of words within the interval (0 to 100)
    * **return** (*tuple*): the minimum and maximum size values
    * **raise** (*ValueError*): when **percent** is outside the 0 to 100 range
    """
    if not 0 <= percent <= 100:
        raise ValueError(f"percent must be between 0 and 100, got {percent}")
    # p: percentage (0<=p<=1), approximated by the fraction a/b
    # n: number of quantiles
    # o: offset on each side
    dict_len = [len(word) for word in dictionary]
    p = percent / 100
    (a, b) = Fraction(p).limit_denominator(1000).as_integer_ratio()
    if a == b:
        # The whole dictionary is requested: the offset would be 0, which does
        # not map to any cut point, so return the extreme lengths directly.
        return (float(min(dict_len)), float(max(dict_len)))
    # Each side leaves out (1-p)/2 = (b-a)/(2b) of the words. When b-a is even
    # the fraction is reduced to ((b-a)/2)/b to keep the quantile count small.
    (o, n) = (b - a, 2 * b) if (b - a) % 2 == 1 else ((b - a) // 2, b)
    # Cut points are computed once and indexed from both ends.
    cut_points = quantiles(dict_len, n=n)
    return (cut_points[o - 1], cut_points[-o])
def get_missing_letters(dictionary, alphabet)
-
get_missing_letters()
gets every letter used in the dictionary that is not in the alphabet.
- dictionary (list): the input dictionary (while processing)
- alphabet (list): the used alphabet (from input file or from dictionary)
- return (list): the list of letters used at least once in the dictionary that are not in the alphabet
Expand source code
def get_missing_letters(dictionary, alphabet):
    """
    `get_missing_letters()` lists the letters used in the dictionary that are absent from the alphabet.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **alphabet** (*list*): the used alphabet (from input file or from dictionary)
    * **return** (*list*): each letter used at least once in the dictionary and not found in the alphabet, in order of first appearance
    """
    unknown_letters = (
        symbol
        for entry in dictionary
        for symbol in entry
        if symbol not in alphabet
    )
    # dict keys keep insertion order and collapse repeated letters.
    return list(dict.fromkeys(unknown_letters))
def lower_case_words(dictionary)
-
lower_case_words()
lower-cases every word from the dictionary.- dictionary (list): the input dictionary (while processing)
- return (list): the dictionary with each word lower-cased
Expand source code
def lower_case_words(dictionary):
    """
    `lower_case_words()` returns the dictionary with every word converted to lower case.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*list*): a new list holding each word lower-cased, in the same order
    """
    lowered = []
    for entry in dictionary:
        lowered.append(entry.lower())
    return lowered
def open_alphabet(filename)
-
open_alphabet()
gets the input alphabet from a file.
- filename (str): the name of the file to open (`read` mode)
- return (list): the input alphabet
Expand source code
def open_alphabet(filename):
    """
    `open_alphabet()` gets the input alphabet from a file.

    The file content is split on any run of whitespace, so line breaks never
    stay attached to a letter and repeated spaces do not create empty entries.

    * **filename** (*str*): the name of the file to open (`read` mode)
    * **return** (*list*): the input alphabet, with the empty string `''` inserted at index 0
    """
    # The context manager closes the file even when reading raises.
    with open(filename, "r") as alphabet_file:
        alphabet = alphabet_file.read().split()
    # The empty string is always the first entry of the alphabet.
    alphabet.insert(0, '')
    return alphabet
- filename (str): the name of the file to open (
def open_dictionaries(filenames)
-
open_dictionaries()
gets the input dictionary from the given files.
- filenames (list): the list of names of files to open (`read` mode)
- return (list): the input dictionary
Expand source code
def open_dictionaries(filenames):
    """
    `open_dictionaries()` gets the input dictionary from one or several files.

    Each file is split on line breaks. Lines starting with `#`, empty lines and
    lines made of a single space are skipped. The words of every file are kept,
    in the order of **filenames**.

    * **filenames** (*list*): the list of names of files to open (`read` mode)
    * **return** (*list*): the input dictionary (empty when **filenames** is empty)
    """
    dictionary = []
    for filename in filenames:
        # The context manager closes the file even when reading raises.
        with open(filename, "r") as dictionary_file:
            lines = dictionary_file.read().split('\n')
        # Accumulate instead of overwriting so that earlier files are not lost.
        dictionary.extend(
            line for line in lines
            if not (line.startswith('#') or line == '' or line == ' ')
        )
    return dictionary
- filenames (list): the list of names of files to open (
def print_acronyms(dictionary)
-
print_acronyms()
prints every acronym from the dictionary. An acronym is a word such that `word == word.upper()`.
- dictionary (list): the input dictionary (while processing)
- return (None)
Expand source code
def print_acronyms(dictionary):
    """
    `print_acronyms()` prints every acronym from the dictionary, one per line.
    An acronym is a word such that `word == word.upper()`.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*None*)
    """
    upper_cased_only = (entry for entry in dictionary if entry.upper() == entry)
    for acronym in upper_cased_only:
        print(acronym)
def print_plural_words(dictionary, lang)
-
print_plural_words()
prints every word from the dictionary that is already in the dictionary in singular form.- dictionary (list): the input dictionary (while processing)
- lang (str): the language used to follow plural rules (only `fr` is available so far)
- return (None)
Expand source code
def print_plural_words(dictionary, lang):
    """
    `print_plural_words()` prints every word from the dictionary that is already in the dictionary in singular form.

    French rules: a word ending in `s` or `x` whose stem is in the dictionary, or
    a word longer than three letters ending in `aux` whose `al` or `ail` form is
    in the dictionary. Empty words never match.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **lang** (*str*): the language used to follow plural rules (only `'fr'` is handled, any other value prints nothing)
    * **return** (*None*)
    """
    if lang == 'fr':
        # Built once so that each singular lookup is a constant-time test.
        known_words = set(dictionary)
        for word in dictionary:
            # endswith() is False on an empty word, unlike indexing its last letter.
            if word.endswith(('s', 'x')) and word[:-1] in known_words \
            or len(word) > 3 and word.endswith('aux') \
            and (word[:-3] + 'al' in known_words or word[:-3] + 'ail' in known_words):
                print(word)
    # insert here plural rules from other languages
def process_dictionary(dictionary)
-
process_dictionary()
sorts the dictionary and removes duplicated words.- dictionary (list): the input dictionary (while processing)
- return (list): the sorted dictionary without duplicated words
Expand source code
def process_dictionary(dictionary):
    """
    `process_dictionary()` drops duplicated words and sorts what is left.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*list*): a new sorted list holding each word once
    """
    unique_words = list(set(dictionary))
    unique_words.sort()
    return unique_words
def process_size(size)
-
process_size()
converts the size argument to a min_len and max_len couple (tuple length two).- size (str): the input argument value
- return (couple): the minimum and maximum size values
Expand source code
def process_size(size):
    """
    `process_size()` converts the size argument to a min_len and max_len couple (tuple length two).

    Accepted forms are `':'` (no limit), `'N'` (exactly N), `':N'` (at most N),
    `'N:'` (at least N) and `'N:M'` (from N to M). Every bound is clamped to the
    `0` to `sys.maxsize` range and the resulting couple is always validated.

    * **size** (*str*): the input argument value
    * **return** (*couple*): the minimum and maximum size values
    * **raise** (*ValueError*): when a bound is not an integer, when more than one `:` is given or when the minimum is greater than the maximum
    """
    if ':' in size:
        lower, _, upper = size.partition(':')
        if ':' in upper:
            # Reject 'a:b:c' instead of silently ignoring the middle value.
            raise ValueError(f"size value error: {size!r} contains more than one ':'")
        # A missing bound means "no limit" on that side.
        min_len = max(int(lower), 0) if lower else 0
        max_len = min(int(upper), maxsize) if upper else maxsize
    else:
        exact = int(size)
        min_len = max(exact, 0)
        max_len = min(exact, maxsize)
    if max_len < min_len:
        # ValueError is a subclass of Exception, so callers catching Exception still work.
        raise ValueError(f"size value error: {min_len} is greater than {max_len}")
    return (min_len, max_len)
def remove_acronyms(dictionary)
-
remove_acronyms()
removes every acronym from the dictionary. An acronym is a word such that `word == word.upper()`.
- dictionary (list): the input dictionary (while processing)
- return (list): the dictionary without acronyms
Expand source code
def remove_acronyms(dictionary):
    """
    `remove_acronyms()` removes every acronym from the dictionary.
    An acronym is a word such that `word == word.upper()`.

    The list is modified in place and every occurrence of an acronym is removed,
    including duplicates. The order of the remaining words is kept.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **return** (*list*): the same list object, without acronyms
    """
    # Slice assignment keeps the caller's list object while filtering in one pass.
    dictionary[:] = [word for word in dictionary if word != word.upper()]
    return dictionary
def remove_missing_letters(dictionary, missing_letters)
-
remove_missing_letters()
removes from the dictionary every word that uses at least one letter that is not in the alphabet.
- dictionary (list): the input dictionary (while processing)
- missing_letters (list): letters used in the dictionary that are not in the alphabet
- return (list): the dictionary without any word that contains a letter from missing_letters
Expand source code
def remove_missing_letters(dictionary, missing_letters):
    """
    `remove_missing_letters()` removes from the dictionary every word that uses at least one letter that is not in the alphabet.

    The list is modified in place and every occurrence of an offending word is
    removed, including duplicates. The order of the remaining words is kept.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **missing_letters** (*list*): letters used in the dictionary that are not in the alphabet
    * **return** (*list*): the same list object, without any word that contains a letter from **missing_letters**
    """
    # Materialise once so that any iterable can be scanned for every word.
    letters = tuple(missing_letters)
    # Slice assignment keeps the caller's list object while filtering in one pass.
    dictionary[:] = [
        word for word in dictionary
        if not any(letter in word for letter in letters)
    ]
    return dictionary
def remove_plural_words(dictionary, lang)
-
remove_plural_words()
removes from the dictionary every word that is already in the dictionary in singular form.- dictionary (list): the input dictionary (while processing)
- lang (str): the language used to follow plural rules (only `fr` is available so far)
- return (list): the dictionary without duplicated words in singular / plural forms
Expand source code
def remove_plural_words(dictionary, lang):
    """
    `remove_plural_words()` removes from the dictionary every word that is already in the dictionary in singular form.

    French rules: a word ending in `s` or `x` whose stem is in the dictionary, or
    a word longer than three letters ending in `aux` whose `al` or `ail` form is
    in the dictionary. The list is modified in place, every occurrence of a
    plural word is removed and empty words are left untouched.

    * **dictionary** (*list*): the input dictionary (while processing)
    * **lang** (*str*): the language used to follow plural rules (only `'fr'` is handled, any other value leaves the dictionary unchanged)
    * **return** (*list*): the same list object, without duplicated words in singular / plural forms
    """
    if lang == 'fr':
        # Snapshot taken before filtering: plurals are judged against the full input.
        known_words = set(dictionary)

        def is_plural(word):
            # endswith() is False on an empty word, unlike indexing its last letter.
            return (
                word.endswith(('s', 'x')) and word[:-1] in known_words
                or len(word) > 3 and word.endswith('aux')
                and (word[:-3] + 'al' in known_words or word[:-3] + 'ail' in known_words)
            )

        # Slice assignment keeps the caller's list object while filtering in one pass.
        dictionary[:] = [word for word in dictionary if not is_plural(word)]
    # HERE insert plural rules from other languages
    return dictionary
def write_clean_dictionary(dictionary, filename)
-
write_clean_dictionary()
writes the processed dictionary in a file.- dictionary (list): the input dictionary (after processing)
- filename (str): the name of the file to open (`write` mode)
- return (None)
Expand source code
def write_clean_dictionary(dictionary, filename):
    """
    `write_clean_dictionary()` writes the processed dictionary in a file, one word per line.

    * **dictionary** (*list*): the input dictionary (after processing)
    * **filename** (*str*): the name of the file to open (`write` mode, existing content is overwritten)
    * **return** (*None*)
    """
    # The context manager flushes and closes the file even when a write raises.
    with open(filename, 'w') as output_file:
        output_file.writelines(word + '\n' for word in dictionary)
def write_generated_words(word_list, filename)
-
write_generated_words()
writes the list of generated words in a file.- word_list (str): the string that contain all generated words
- filename (str): the name of the file to open (`write` mode)
- return (None)
Expand source code
def write_generated_words(word_list, filename):
    """
    `write_generated_words()` writes the generated words in a file.

    * **word_list** (*str*): the string that contains all generated words (written as is)
    * **filename** (*str*): the name of the file to open (`write` mode, existing content is overwritten)
    * **return** (*None*)
    """
    # The context manager flushes and closes the file even when the write raises.
    with open(filename, 'w') as output_file:
        output_file.write(word_list)