
# Danielle Huser

# CMSC416 – Natural Language Processing

# 18 April 2024

# dosto.py

# This program reads integers n and s, plus one or more .txt files, from the command line

# and uses n-grams from those text files to generate s sentences

# Algorithm:

# Accepts n, s, and training file names from the command line

# Read all text into one string and clean it (lowercase, separate punctuation)

# Split the cleaned text on whitespace into a list of tokens

# Create another list of all n-grams over the full token list (sentence boundaries are just '.' / '!' / '?' tokens)

# Start each sentence by randomly selecting an n-gram whose first token is a period, exclamation point, or question mark; that leading token is excluded from the output

# Find matching n-grams where the first n-1 words match the previous n-gram’s last n-1 words

# End the sentence when a period, exclamation point, or question mark is generated

# Print s sentences
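#
# Illustrative walk-through of the generation step (made-up tokens, not taken from the training texts):
# with n = 4, a seed n-gram might be ['.', 'he', 'sat', 'silent']; the leading '.' is dropped, so the
# sentence starts "he sat silent", and the next n-gram must begin with ['he', 'sat', 'silent'],
# e.g. ['he', 'sat', 'silent', 'and'], which appends "and" to the sentence.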

# Example I/O:

# I used Dostoevsky’s Crime & Punishment, Notes From Underground, White Nights & Other Stories, The Idiot, and The Brothers Karamazov

#

# python3 dosto.py 4 50 cp.txt bk.txt nu.txt wn.txt id.txt

#

# as though i had not said a word yet ; he sat silent and listened to him and the promise i gave him a polite , even ceremonious , bow .

# no one had seen here before .

# he knows it and yet asks for guarantees .

# you’ll go to dounia , he laid before marfa petrovna came upon them in the right way at first .

# hm . . .

# five minutes passed .

# Instructions:

# Enter the n-gram size n, the number of sentences s, and all training filenames (full paths may be needed)

# At command line:

# $ python3 dosto.py n s filename1.txt filename2.txt filename3.txt

# $ python3 dosto.py 4 50 cp.txt bk.txt nu.txt wn.txt id.txt

from sys import argv

import random

# defining punctuation

punc = '!;:,.?"'

stops = '''()[]{}<>/\'\"-@#$%^&*_~'''

# accepting the n-gram length and number of sentences from command line

n = int(argv[1])

s = int(argv[2])

allText = ""

# opening all of the files listed and reading them into a string

for txt in argv[3:]:
    with open(txt, "r") as file:
        allText += file.read()

# replacing punctuation marks so they split into their own tokens and removing excess symbols

# making all text lowercase for simplicity and to reduce sparsity

allText = allText.lower()

for ele in stops:
    allText = allText.replace(ele, "")

for ele in punc:
    allText = allText.replace(ele, " " + ele)
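# For example, "Hello, world!" becomes "hello , world !", which then splits into
# the tokens ['hello', ',', 'world', '!'].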

# splitting into list

tokens = allText.split()

# creating list of all n-grams from the full text (duplicates are kept)

gramList = []

for i in range(len(tokens)):
    gram = []
    try:
        for j in range(n):
            gram.append(tokens[i + j])
        gramList.append(gram)
    except IndexError:
        break
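# For example, with n = 2 and tokens ['he', 'sat', 'silent', '.'], gramList becomes
# [['he', 'sat'], ['sat', 'silent'], ['silent', '.']]; the IndexError at the last
# position ends the loop.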

# was testing probabilistic selection, but this makes the sentences less "original"

#gramUn = list(set(gramList))

#gramFr = []

#for gr in gramUn:

#gramFr.append(gramList.count(gr))
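# If probabilistic selection is revisited, note that set(gramList) above would raise a
# TypeError because lists are unhashable; one possible sketch (not used by this program,
# names are illustrative) would count tuple n-grams and weight the random draw:
#
#     from collections import Counter
#     gramCounts = Counter(tuple(g) for g in gramList)
#     gramUn = list(gramCounts.keys())
#     gramFr = list(gramCounts.values())
#     nextGram = list(random.choices(gramUn, weights=gramFr, k=1)[0])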

# creating sentences

for i in range(s):
    endSentence = 0
    sentence = ""
    lastWord = []

    # taking random choice for beginning of sentence based on n-grams that begin with a period or other sentence-ending char
    first = random.choice([g for g in gramList if (g[0] == '.' or g[0] == '!' or g[0] == '?') and g[1] != '.'])
    for k in range(n):
        if k > 0:
            sentence += (first[k] + " ")

    # keeping track of last words and adding on another random ngram with a match
    lastWord = first[1:]
    while endSentence == 0:
        nextGram = random.choice([gr for gr in gramList if gr[:-1] == lastWord])
        sentence += (nextGram[n - 1] + " ")
        if nextGram[n - 1] == '.' or nextGram[n - 1] == '!' or nextGram[n - 1] == '?':
            endSentence = 1
            break
        lastWord = nextGram[1:]

    # printing the final sentence
    print(sentence + "\n")
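# A possible safeguard (not part of the original behavior): if lastWord has no matching
# continuation in gramList, random.choice([]) raises an IndexError; this can only arise
# when the chain lands on the tail of the corpus's final n-gram and that tail starts no
# other n-gram. A guard such as
#
#     candidates = [gr for gr in gramList if gr[:-1] == lastWord]
#     if not candidates:
#         break
#     nextGram = random.choice(candidates)
#
# inside the while loop would end the sentence gracefully instead.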