#!/usr/bin/ruby
# ghoti v1.03
# 20030729
# http://d.hatena.ne.jp/svnseeds/
require 'MeCab'
require 'jcode'
$KCODE = "e"
srand
module Ghoti
def self.new(*args)
  # Convenience constructor: Ghoti.new(...) builds a Ghoti::Core and forwards
  # every argument to it unchanged.
  Core.new(*args)
end
class Core
DATFILE_NAME = 'ghoti.dat'
DEBUG = nil # nil or 1
QUEUE = 20
def initialize(dir_name)
  # Sets up an empty engine rooted at dir_name, then replays the stored dat
  # file (via load_dat) so that earlier utterances are learned again.
  #
  # dir_name: directory holding DATFILE_NAME (and the debug dump files).
  @dirname = dir_name
  @datfile = "#{dir_name}/#{DATFILE_NAME}"
  @mecab = MeCab::Tagger.new([$0])

  # Trigram tables. Unknown keys answer "" so that callers can probe a slot
  # with empty? / == "" before creating it.
  @markov_pre = Hash.new("")   # {2nd}{3rd} => [candidates for the 1st word]
  @markov_post = Hash.new("")  # {2nd}{1st} => [candidates for the 3rd word]

  # Keyword bookkeeping; every word is a [part-of-speech, word] pair.
  @keywords = []                    # [n] => keywords found in the n-th sentence
  @keywords_recent = []             # keywords from recent human/Ghoti utterances (kept to QUEUE entries)
  @keyword_location = Hash.new("")  # keyword => [indexes of the sentences containing it]

  load_dat
end
def load_dat
  # Replays the dat file, a plain text file holding one utterance per line.
  # Each line is handed to parse_and_generate. Nothing happens when the file
  # does not exist or cannot be read. No meaningful return value.
  return unless File.readable?(@datfile)
  # Block form guarantees the handle is closed even if parsing a line raises.
  File.open(@datfile) do |f|
    f.each_line { |line| parse_and_generate(line) }
  end
  nil
end
def parse_and_generate(str)
  # Learns one utterance: tokenizes str, feeds the tokens to the trigram
  # tables and the keyword index, then hands them to the debug word dump.
  # Returns whatever out_words returns.
  tokens = parse_sentence(str)
  generate_markovchain(tokens)
  generate_keywords(tokens)
  out_words(tokens)
end
def parse_sentence(str)
  # Tokenizes one line of text.
  #
  # str is normalized (in place, see normalize), analysed with MeCab, and the
  # MeCab output is turned into an array of [pos, word] pairs:
  #   * pos is the first feature column with columns 2..6 appended, skipping
  #     the "*" placeholders;
  #   * an "EOS" line becomes the sentence marker ["#","#"];
  #   * particles, auxiliaries and other dependent words are glued onto the
  #     preceding token, unless that token is a general / sa-hen / proper
  #     noun or a full stop;
  #   * a ["#","#"] marker is put in front of the result.
  # Returns the array of pairs.
  attachable = /^(?:助詞接続助詞|助詞終助詞|助詞副助詞|助詞並立助詞|助動詞不変化型基本形|動詞非自立|動詞接尾|形容詞非自立|形容詞接尾|名詞非自立)/
  standalone = /^(?:名詞一般|名詞サ変接続|名詞固有名詞|記号句点)/
  words = []
  # String#each no longer exists from Ruby 1.9 on; each_line behaves the same
  # and is also available on 1.8.
  @mecab.parse(normalize(str)).each_line do |line|
    if line =~ /^EOS/
      words.push(["#","#"])
      next
    end
    surface, feature = line.chomp.split(/\t/)
    # Blank or malformed lines carry no feature column; skip them instead of
    # failing on nil.
    next if feature.nil?
    pos = feature.split(/,/)
    tag = pos[0].to_s
    (1..5).each do |i|
      tag += pos[i] if pos[i] && pos[i] != "*"
    end
    # Unknown words are forced to be treated as sa-hen nouns.
    tag = "名詞サ変接続" if tag == "未知語"
    prev = words.last
    if tag =~ attachable && prev && prev[0] !~ standalone
      # Glue the dependent word onto the previous token.
      prev[0] += tag
      prev[1] += surface
      next
    end
    words.push([tag, surface])
  end
  words.unshift(["#","#"])
end
def normalize(s)
# Cleans up one line of text before it is passed to MeCab.
# s is modified in place (chomp!, tr!, gsub!) and the same object is
# returned, so the caller's string changes as well.
# NOTE(review): several tr! calls below map a character set onto itself in
# this copy of the file, which leaves the text unchanged; the full-width
# halves of those pairs were presumably lost in an encoding conversion -
# confirm against the original source.
s.chomp!
# s.tr!("a-z", "a-z") # Unlike ChaSen, MeCab seems to prefer half-width alphabet characters
s.tr!("a-z", "a-z")
# s.tr!("A-Z", "A-Z")
s.tr!("A-Z", "A-Z")
s.tr!("0-9", "0-9")
# s.tr!("0-9", "0-9")
s.tr!(" ", " ")
# Punctuation, one character per call.
s.tr!("\!", "!")
s.tr!("\"", "”")
s.tr!("\#", "#")
s.tr!("\$", "$")
s.tr!("\%", "%")
s.tr!("\&", "&")
s.tr!("\'", "’")
s.tr!("\(", "(")
s.tr!("\)", ")")
s.tr!("\*", "*")
s.tr!("\+", "+")
s.tr!("\,", ",")
s.tr!("\-", "−")
s.tr!("\.", ".")
s.tr!("\/", "/")
s.tr!("\:", ":")
s.tr!("\;", ";")
s.tr!("\<", "<")
s.tr!("\=", "=")
s.tr!("\>", ">")
s.tr!("\?", "?")
s.tr!("\@", "@")
# NOTE(review): "]" is handled here and again two lines below, while "[" is
# never handled - possibly a typo for "\[". Confirm.
s.tr!("\]", "]")
s.tr!("\\", "¥")
s.tr!("\]", "]")
s.gsub!('\^', '^')
s.tr!("\_", "_")
s.tr!("\`", "‘")
s.tr!("\{", "{")
s.tr!("\|", "|")
s.tr!("\}", "}")
s.tr!("\~", "〜")
# NOTE(review): the gsub! calls below (and the '\^' one above) pass String
# patterns, which Ruby matches literally rather than as regular expressions,
# so they only fire on those exact character sequences. They look like they
# were meant to squeeze whitespace - confirm whether Regexp literals were
# intended.
s.gsub!("[\s ]+"," ")
s.gsub!("\s+"," ")
s.gsub!(" +","")
s.gsub!("^\n","")
return s
end
def generate_markovchain(w)
  # Feeds every run of three consecutive words (head, mid, tail) of w into
  # both trigram tables:
  #   @markov_pre[mid][tail]  collects the words seen in front of that pair
  #   @markov_post[mid][head] collects the words seen behind that pair
  # Both tables answer "" for unknown keys, so empty? tells a missing slot
  # from a populated one.
  w.size.times do |idx|
    head, mid, tail = w[idx], w[idx+1], w[idx+2]
    next if tail.nil?   # fewer than three words left

    @markov_pre[mid] = Hash.new("") if @markov_pre[mid].empty?
    before = @markov_pre[mid]
    before[tail] = [] if before[tail].empty?
    before[tail].push(head)

    @markov_post[mid] = Hash.new("") if @markov_post[mid].empty?
    after = @markov_post[mid]
    after[head] = [] if after[head].empty?
    after[head].push(tail)
  end
  nil
end
def generate_keywords(w)
  # Extracts the keywords of one sentence: general nouns, sa-hen nouns,
  # proper nouns and interjections. The list is appended to @keywords, and
  # each keyword's sentence index is recorded in @keyword_location.
  # A sentence without any keyword is stored as [["#","#"]] and leaves
  # @keyword_location untouched.
  keyword_pos = /^(?:名詞一般|名詞サ変接続|名詞固有名詞|感動詞)/
  found = w.select { |entry| entry[0] =~ keyword_pos }
  return @keywords.push([["#","#"]]) if found.empty?

  @keywords.push(found)
  sentence_no = @keywords.size-1
  found.each do |k|
    # Unknown keys answer "" (see the Hash default), hence empty?.
    @keyword_location[k] = [] if @keyword_location[k].empty?
    @keyword_location[k].push(sentence_no)
  end
end
def memorize(str)
  # Learns str (trigram tables and keywords), queues its keywords as recent
  # ones and appends it to the dat file.
  debugp("memorize")
  parse_and_generate(str)
  stuff_keywords
  # Written last: parsing normalizes str in place, so the normalized text is
  # what ends up in the dat file.
  write_dat(str)
end
def write_dat(s)
  # Appends s as one line to the dat file.
  # Block form closes the handle even when the write fails.
  File.open(@datfile,"a") { |f| f.puts(s) }
end
def stuff_keywords
  # Moves the keywords of the newest sentence into @keywords_recent, a queue
  # kept at no more than QUEUE entries (oldest entries are dropped first).
  # The ["#","#"] placeholder is never queued.
  @keywords.last.each do |k|
    @keywords_recent << k unless k == ["#","#"]
    @keywords_recent.shift if @keywords_recent.size > QUEUE
  end
end
def talk(str=nil)
  # Produces one utterance and returns it as a String.
  #
  # With str, the text is learned first (not written to the dat file) and the
  # reply is seeded from it; without str the reply is seeded from the recent
  # conversation. Returns "" when no keyword can be found at all, e.g. when
  # nothing has been learned yet.
  if str
    parse_and_generate(str)
    stuff_keywords
  end
  debugp("talk")
  debugp("@keywords_recent = ",@keywords_recent)
  seedwords = get_seedwords
  debugp("seedwords = ",seedwords)
  keyword = get_relative_keyword(seedwords)
  debugp("keyword = ",keyword)
  # Without a keyword there is nothing to build a sentence from; bail out
  # instead of queueing nil and failing inside generate_sentence.
  return "" if keyword.nil? || keyword.empty?
  @keywords_recent.push(keyword)
  @keywords_recent.shift if @keywords_recent.size > QUEUE
  debugp("@keywords_recent = ",@keywords_recent)
  debugp("last chunk of @keywords = ",@keywords[@keywords.size-1])
  generate_sentence(keyword).join
end
def get_seedwords
  # Returns the keywords of the newest sentence in @keywords.
  # When that sentence had no keywords ([["#","#"]]), or when no sentence has
  # been recorded yet, falls back to get_random_keywords, whose result
  # (possibly [["#","#"]] again) is returned as is.
  debugp("# get_seedwords")
  latest = @keywords.last
  debugp("## last chunk of @keywords = ",latest)
  # latest is nil while @keywords is still empty; treat that like "no keywords"
  # so callers never receive nil.
  if latest.nil? || latest == [["#","#"]]
    debugp("## last chunk of @keywords is '#' - go to get_random_keywords")
    return get_random_keywords
  end
  debugp("## got valid keywords - return last words of @keywords")
  latest
end
def get_random_keywords
  # Draws two words at random (the same entry may be drawn twice) from
  # @keywords_recent and returns them in an array.
  # Returns [["#","#"]] when @keywords_recent is empty.
  debugp("# get_random_keyword")
  if @keywords_recent.empty?
    debugp("## @keywords_recent is empty - return #")
    return [["#","#"]]
  end
  debugp("## @keywords_rencent is not empty - return 2 words from @keywords")
  Array.new(2) { @keywords_recent[rand(@keywords_recent.size)] }
end
def get_relative_keyword(sw)
  # Picks one seed word at random from sw, then returns a random keyword from
  # a random sentence in which that seed word appeared.
  # When sw is [["#","#"]], get_random_word is used instead; when the seed
  # word has no recorded location, the seed word itself is returned.
  debugp("# get_relative_keyword")
  if sw == [["#","#"]]
    debugp("## seedwords are empty - go to get_random_word")
    return get_random_word
  end
  debugp("## seedwords are not empty - return a word from seedwords")
  seed = sw[rand(sw.size)]
  sentences = @keyword_location[seed]
  return seed if sentences == ""   # "" is the table's answer for unknown keys
  chunk = @keywords[sentences[rand(sentences.size)]]
  chunk[rand(chunk.size)]
end
def get_random_word
  # Returns a randomly chosen key of @markov_pre to seed a sentence with.
  # (Used when started without a dat file and no keyword has been hit so far.)
  debugp("# get_random_word")
  # debugp("## @markov_pre = ",@markov_pre)
  candidates = @markov_pre.keys
  rw = candidates[rand(candidates.size)]
  debugp("## random word = ",rw)
  rw
end
def generate_sentence(kw)
  # Builds a sentence around the keyword kw with the trigram tables and
  # returns it as an array of word strings (callers join it).
  #
  # A random follower of kw is chosen first; the sentence is then grown
  # backwards with @markov_pre and, unless the follower is the sentence
  # marker, forwards with @markov_post.
  debugp("# generate_sentence")
  boundary = ["#","#"]
  enders = [["記号句点","。"], ["記号一般","?"]]
  ary = [kw[1]]
  debugp("## ary = ",ary)
  followers = @markov_pre[kw].keys
  skw = followers[rand(followers.size)]
  ary.push(skw[1]) if skw != boundary
  # debugp("## ary = ",ary)

  # Backwards: (mid, nxt) is the pair whose predecessor is looked up.
  mid, nxt = kw, skw
  loop do
    choices = @markov_pre[mid][nxt]
    pre = choices[rand(choices.size)]
    # Stop at the sentence start; also stop (without adding) at a sentence
    # ender so the result does not grow too long.
    break if pre == boundary || enders.include?(pre)
    ary.unshift(pre[1])
    mid, nxt = pre, mid
  end
  # debugp("## ary = ",ary)

  # Forwards: (prv, mid) is the pair whose successor is looked up.
  if skw != boundary
    prv, mid = kw, skw
    loop do
      choices = @markov_post[mid][prv]
      post = choices[rand(choices.size)]
      break if post == boundary
      ary.push(post[1])
      # A sentence ender is kept, then generation stops to limit the length.
      break if enders.include?(post)
      prv, mid = mid, post
    end
  end
  # debugp("## ary = ",ary)
  ary
end
def out_markovchain
  # Debug dump (only when DEBUG is set): writes both trigram tables to
  # "<dirname>/out.markovchain", overwriting the previous dump.
  return unless DEBUG
  File.open("#{@dirname}/out.markovchain","w") do |f|
    f.puts "PRE(2,3,1):"
    out_markovchain_do(f,@markov_pre)
    f.puts "POST(2,1,3):"
    # The POST section must show @markov_post (it used to repeat @markov_pre).
    out_markovchain_do(f,@markov_post)
  end
  nil
end
def out_markovchain_do(f,m)
  # Writes one trigram table m to the open stream f: for every (outer key,
  # inner key) pair in sorted order a header line, followed by the sorted
  # candidate words, each printed as "pos/word".
  m.keys.sort.each do |outer|
    inner_table = m[outer]
    inner_table.keys.sort.each do |inner|
      f.puts "\t#{outer[0]}/#{outer[1]} + #{inner[0]}/#{inner[1]}:"
      inner_table[inner].sort.each { |cand| f.puts "\t\t#{cand[0]}/#{cand[1]}" }
    end
  end
end
def out_keywords
  # Debug dump (only when DEBUG is set): writes @keywords to
  # "<dirname>/out.keywords" - the sentence index on one line, followed by
  # that sentence's keywords, sorted, as "<TAB>pos/word".
  return unless DEBUG
  # Block form closes the handle even if writing raises.
  File.open("#{@dirname}/out.keywords","w") do |f|
    @keywords.each_with_index do |chunk, i|
      f.puts i
      chunk.sort.each { |kw| f.puts "\t#{kw[0]}/#{kw[1]}" }
    end
  end
  nil
end
def out_keyword_location
  # Debug dump (only when DEBUG is set): writes @keyword_location to
  # "<dirname>/out.keyword_location", one sorted keyword per line in the form
  # "pos/word: i,j,k," where i, j, k are sentence indexes.
  return unless DEBUG
  # Block form closes the handle even if writing raises.
  File.open("#{@dirname}/out.keyword_location","w") do |f|
    @keyword_location.keys.sort.each do |k|
      f.print "#{k[0]}/#{k[1]}: "
      @keyword_location[k].each { |n| f.print n,"," }
      f.print "\n"
    end
  end
  nil
end
def out_keywords_recent
  # Debug dump (only when DEBUG is set): writes the recent-keyword queue to
  # "<dirname>/out.keywords_recent", one "pos/word" per line, oldest first.
  return unless DEBUG
  # Block form closes the handle even if writing raises.
  File.open("#{@dirname}/out.keywords_recent","w") do |f|
    @keywords_recent.each { |k| f.puts "#{k[0]}/#{k[1]}" }
  end
  nil
end
def debugp(msg, val=nil)
  # Debug trace, silent unless DEBUG is set.
  # With a truthy val: prints msg followed by val.inspect on the same line.
  # Otherwise: prints msg on a line of its own.
  return nil unless DEBUG
  return puts(msg) unless val
  print msg
  p val
end
def out_words(words)
  # Debug dump (only when DEBUG is set): appends every [pos, word] pair of
  # words to "<dirname>/out.words" as "pos/word", one per line.
  return unless DEBUG
  # Block form closes the handle even if writing raises.
  File.open("#{@dirname}/out.words","a") do |f|
    words.each { |w| f.puts "#{w[0]}/#{w[1]}" }
  end
  nil
end
end
end
if $0 == __FILE__
  # Command-line entry point.
  #   ARGV[0]  data directory handed to Ghoti.new
  #   ARGV[1]  text to reply to (one-shot mode only)
  #   -i       interactive mode: read lines from a "> " prompt
  #   -m       with -i: memorize both the input and the reply in the dat file
  require 'getopts'
  getopts('im')
  require 'readline' if $OPT_i
  ghoti = Ghoti.new(ARGV[0])
  if $OPT_i
    loop do
      str = Readline.readline("> ", true)
      break unless str                         # end of input
      break if str =~ /^(exit|quit)?$/         # "exit", "quit" or an empty line
      if $OPT_m
        ghoti.memorize(str)
        msg = ghoti.talk
        ghoti.memorize(msg)
      else
        msg = ghoti.talk(str)
        # Learn the reply for this session only (nothing is written to disk).
        ghoti.parse_and_generate(msg)
      end
      puts msg
    end
  else
    puts ghoti.talk(ARGV[1])
  end
  # Debug dumps; each one is a no-op unless DEBUG is set.
  at_exit {
    ghoti.out_markovchain
    ghoti.out_keywords
    ghoti.out_keyword_location
    ghoti.out_keywords_recent
  }
end
# (geocities.com/svnseeds)