# Danielle Huser
# CMSC416 – Natural Language Processing
# 18 April 2024
# dosto.py
# this program reads integers n and s as well as a number of txt files from the command line
# and using n-grams from the text files, generates s number of sentences
# Algorithm:
# Accepts n, s, and training file names from the command line
# Enter all text into one string and clean (lowercase and fix punctuation)
# Separate all tokens by spaces into a list
# Create another list of all n-grams within each sentence of the full text
# Generate sentence by finding first words by randomly selecting an ngram that starts with a period, exclamation point, or question mark token (exclude first token)
# Find matching n-grams where the first n-1 words match the previous n-gram’s last n-1 words
# end the sentence when a period, exclamation point, or question mark is found.
# Print s sentences
# Example I/O:
# I used Dostoevsky’s Crime & Punishment, Notes From Underground, White Nights & Other Stories, The Idiot, and The Brothers Karamazov
#
# python3 dosto.py 4 50 cp.txt bk.txt nu.txt wn.txt id.txt
#
# as though i had not said a word yet ; he sat silent and listened to him and the promise i gave him a polite , even ceremonious , bow .
# no one had seen here before .
# he knows it and yet asks for guarantees .
# you’ll go to dounia , he laid before marfa petrovna came upon them in the right way at first .
# hm . . .
# five minutes passed .
# Instructions:
# Enter ngram number, number of sentences, and all filenames (may need complete path)
# At command line:
# $ python3 ngram.py n s filename1.txt filename2.txt filename3.txt
# $ python3 dosto.py 4 50 cp.txt bk.txt nu.txt wn.txt id.txt
from sys import argv
import random
# defining punctuation
# punc: sentence/clause punctuation kept as stand-alone tokens
punc = '!;:,.?"'
# stops: symbols removed from the text entirely
stops = "()[]{}<>/\\'\"-@#$%^&*_~"

# tokens that terminate a sentence
SENTENCE_ENDERS = ('.', '!', '?')


def clean_text(text):
    """Lowercase *text*, strip symbols in `stops`, and prefix each `punc`
    character with a space so punctuation splits into its own token."""
    # making all text lowercase for simplicity/reduce sparsity
    text = text.lower()
    for ch in stops:
        text = text.replace(ch, "")
    # replacing punctuation so they'll split into their own tokens
    for ch in punc:
        text = text.replace(ch, " " + ch)
    return text


def build_ngrams(tokens, n):
    """Return every contiguous n-gram in *tokens* as a list of lists.

    Returns an empty list when there are fewer than *n* tokens."""
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]


def generate_sentence(gram_list):
    """Generate one sentence by chaining n-grams from *gram_list*.

    Seeds with a random n-gram that starts with a sentence-ending token
    (the ender itself is dropped), then repeatedly appends the last word
    of a random n-gram whose first n-1 words match the previous n-gram's
    last n-1 words, stopping at the next sentence-ending token.
    Returns the sentence as a string with each token followed by a space."""
    # taking random choice for beginning of sentence based on n-grams that
    # begin with a period or other sentence-ending char (skip ". ." starts)
    first = random.choice([g for g in gram_list
                           if g[0] in SENTENCE_ENDERS and g[1] != '.'])
    words = list(first[1:])
    # keeping track of last words and adding on another random ngram with a match
    last_words = first[1:]
    while True:
        next_gram = random.choice([g for g in gram_list if g[:-1] == last_words])
        words.append(next_gram[-1])
        if next_gram[-1] in SENTENCE_ENDERS:
            break
        last_words = next_gram[1:]
    return " ".join(words) + " "


def main():
    """Read n, s, and training files from argv; print s generated sentences."""
    # accepting the n-gram length and number of sentences from command line
    n = int(argv[1])
    s = int(argv[2])
    # opening all of the files listed and reading them into a string
    all_text = ""
    for path in argv[3:]:
        # with-statement guarantees the file handle is closed
        with open(path, "r") as fh:
            all_text += fh.read()
    # splitting into list (avoid shadowing the builtin `all`)
    tokens = clean_text(all_text).split()
    # creating list of all n-grams from full text
    gram_list = build_ngrams(tokens, n)
    # creating and printing the final sentences
    for _ in range(s):
        print(generate_sentence(gram_list) + "\n")


if __name__ == "__main__":
    main()