diff --git a/src/heyarne/frantisek_kafka/markov.clj b/src/heyarne/frantisek_kafka/markov.clj
index ef3e356..95b6c73 100644
--- a/src/heyarne/frantisek_kafka/markov.clj
+++ b/src/heyarne/frantisek_kafka/markov.clj
@@ -28,7 +28,7 @@
   (loop [phrase (vec start)]
     (let [next-word (rand-nth (get markov-chain (take-last (count start) phrase)))
           next-phrase (conj phrase next-word)]
-      (if (should-end? next-word)
+      (if (should-end? next-phrase)
         next-phrase
         (recur next-phrase)))))
 
diff --git a/src/heyarne/frantisek_kafka/samsa.clj b/src/heyarne/frantisek_kafka/samsa.clj
index 4293eb0..ac07cbc 100644
--- a/src/heyarne/frantisek_kafka/samsa.clj
+++ b/src/heyarne/frantisek_kafka/samsa.clj
@@ -13,10 +13,21 @@
 (defn read-corpus [files]
   (flatten (for [file files]
              (-> (slurp file)
-                 (str/replace #"\"" "")
+                 (str/replace #"[^\sa-zA-ZäöüÄÖÜß',.:!?]" "")
                  (str/split #"\s+")))))
 
-(def sentence-ending? #(some? (re-find #"[.?!]$" %)))
+(defn sentence-ending?
+  "Returns truthy when the last word of `words` ends in `.`, `?` or `!`.
+  A trailing \"Mr.\" or \"Mrs.\" only counts when preceded by \"the\" or \"a\"."
+  [words]
+  (let [last-word (last words)
+        second-to-last (last (butlast words))]
+    (and
+      (re-find #"[.?!]$" last-word)
+      (or
+        (not (#{"Mr." "Mrs."} last-word))
+        ;; a trailing title only ends the sentence when an article precedes it
+        (contains? #{"the" "a"} second-to-last)))))
 
 (defn sentence-start? [state]
   (some? (re-find #"^[A-Z]" (first state))))
@@ -78,7 +89,7 @@
       {:exit-message (error-message errors)}
 
       (> (count arguments) 0)
-      {:corpus arguments :order (:order options) :interval (:interval options)}
+      (merge options {:corpus arguments})
 
       :else
       {:exit-message (usage summary)})))