As mentioned during one of Professor Pister’s group meetings, you can treat language as an MDP (strictly speaking, a first-order Markov chain). Given a sample text, you estimate the probability of each word following the word before it, and then sample from those distributions to get mildly coherent text.

Given Toobin’s “Money Unlimited” as the input text and asking for 8 words, the script gives you the following: “some have asserted that viewers should have known”.

``````python
# (c) 2010 Leo Keselman New BSD
#!/usr/bin/python

import sys
import random

class Counter(dict):
    """A dict of numeric tallies in which unseen keys read as zero.

    Reading a missing key stores 0 under that key and returns it, so
    ``counter[key] += 1`` works without any prior initialisation.
    """

    def __getitem__(self, idx):
        """Return the tally for ``idx``, inserting 0 first if it is absent."""
        if idx not in self:
            self[idx] = 0
        return dict.__getitem__(self, idx)

    def normalize(self):
        """Scale every value in place so that the values sum to 1.

        When the values sum to zero (including the empty case) the
        mapping is left untouched, avoiding a division by zero.
        """
        denom = float(sum(self.values()))
        if denom != 0:
            # Snapshot the items so we iterate a fixed list while assigning.
            for key, count in list(self.items()):
                self[key] = count / denom

# Corpus to learn from. open() needs a single path, not the whole argv list.
# NOTE(review): assumes the invocation is `script.py <corpus_path> <num_words>`,
# i.e. the corpus path is the first argument -- confirm.
# The name `input` shadows the builtin but is kept because the tokenizing
# loop below reads from it.
input = open(sys.argv[1])

# bank maps each lower-cased token to a Counter of the tokens seen right after it.
bank = {}
word = ""      # characters of the token currently being assembled
prevword = ""  # most recently completed token ("" before the first one)

# Characters that terminate a word.
delim = ['/', '-', ' ', '.', ',', '?', '!', ';', ':', '"', ')', '(', '&']
# Delimiters that are also recorded as tokens of their own; space and comma
# only separate words and are otherwise dropped.
punct = [x for x in delim if x not in [' ', ',']]

def update_bank(db, key, val):
    """Record one observation of token ``val`` directly following ``key``.

    Both tokens are lower-cased before being stored, so the transition
    table is case-insensitive.

    Args:
        db: Dict mapping a token to a ``Counter`` of its successors.
            Mutated in place; a new ``Counter`` is created the first time
            a key is seen.
        key: The preceding token.
        val: The token that followed ``key``.
    """
    key = key.lower()
    val = val.lower()
    # `in` works on every Python version; dict.has_key() was removed in Python 3.
    if key not in db:
        db[key] = Counter()
    db[key][val] += 1
def sample(dist):
    """Draw one element at random from a discrete probability distribution.

    Args:
        dist: Iterable of ``(element, probability)`` pairs, for example the
            ``items()`` of a normalized ``Counter``. The probabilities are
            expected to sum to 1.

    Returns:
        The selected element, or ``None`` if ``dist`` is empty.
    """
    r = random.random()
    base = 0.0
    element = None
    for element, prob in dist:
        base += prob
        # Select the first element whose cumulative probability exceeds the
        # draw. Testing only the truthiness of `r` would return the first
        # element on practically every call.
        if r < base:
            return element
    # Floating-point rounding can leave the cumulative sum a hair below 1;
    # fall back to the last element instead of returning nothing.
    return element

# Scan the corpus one character at a time, splitting it into tokens and
# recording every (previous token -> next token) transition in `bank`.
for ch in input.read():
    # Ordinary characters extend the current word. Newlines and other
    # whitespace are treated as separators; skipping them outright would
    # glue the last word of one line onto the first word of the next.
    if ch not in delim and not ch.isspace():
        word += ch
        continue

    # A separator ends the current word. Only a non-empty word is recorded,
    # so runs of separators (", " or ". ") do not add empty-string tokens.
    if word:
        update_bank(bank, prevword, word)
        prevword = word
        word = ""

    # Punctuation is itself a token in the chain; spaces and commas are not.
    if ch in punct:
        update_bank(bank, prevword, ch)
        prevword = ch

# Record a final word that is not followed by any separator.
if word:
    update_bank(bank, prevword, word)
    prevword = word
    word = ""

# Turn the raw successor counts into probability distributions.
for dist in bank.values():
    dist.normalize()

# Number of tokens to generate. int() needs a single argument string, not
# the whole argv list.
# NOTE(review): assumes the invocation is `script.py <corpus_path> <num_words>`,
# i.e. the count is the second argument -- confirm.
num_words = int(sys.argv[2])

# Begin as if a sentence has just ended, so the first token drawn is one
# that was seen following a period.
word = '.'

# The with-block guarantees the output is flushed and closed, even on error.
with open('gen_txt', 'w') as output:
    for _ in range(num_words):
        successors = bank.get(word)
        if not successors:
            # Dead end: this token was never seen with anything after it.
            break
        next_w = sample(successors.items())
        if next_w is None:
            break
        if next_w == '.':
            # Put each generated sentence on its own line.
            output.write(".\n")
        else:
            output.write(next_w)
            output.write(' ')
        word = next_w
``````