Improve text generation

This commit is contained in:
arne 2024-01-06 10:50:31 +01:00
commit a3e2ef331f
2 changed files with 39 additions and 28 deletions

View file

@ -6,24 +6,21 @@ František Kafka is a bot that generates text using Markov chains. It is online
## Installation ## Installation
You need your input text in plain text; [Project Gutenberg](https://www.gutenberg.org) is a good source for that. Make sure to remove all parts of the text that are not to be used as the bot's corpus.<sup>1</sup> You need your input text in plain text; [Project Gutenberg](https://www.gutenberg.org) is a good source for that. Make sure to remove all parts of the text that are not to be used as the bot's corpus.
You will also need an access token, so register your app at a mastodon instance of choice to get one. Copy the `.env.example` to `.env` and edit it to set up the environment correctly. You will also need an access token, so register your app at a mastodon instance of choice to get one.
Building and running the Clojure code requires [leiningen](https://leiningen.org/) version 2.8 or above.
1: This means removing the copyright notice, which is the reason why there is no text included in this repository. Building and running the Clojure code requires [babashka](https://babashka.org/); version 1.3 is tested and should work.
## Usage ## Usage
``` ```
# build it and run it $ bb -m heyarne.frantisek-kafka.samsa --interval=5 --order=3 resources/kafka-en.txt
$ lein uberjar
$ java -jar target/uberjar/heyarne.frantisek-kafka-0.1.0-standalone.jar [args]
``` ```
## Options ## Options
``` ```
$ lein run -- --help $ bb -m heyarne.frantisek-kafka.samsa --help
František Kafka is a Markov chain bot that infinitely generates text František Kafka is a Markov chain bot that infinitely generates text
from one or more text files. botsin.space/@frantisek hosts a live from one or more text files. botsin.space/@frantisek hosts a live
version of it that generates an infinite version of Franz Kafka's Metamorphosis. version of it that generates an infinite version of Franz Kafka's Metamorphosis.

View file

@ -10,7 +10,8 @@
;; parsing the input text ;; parsing the input text
(defn read-corpus [files] (defn read-corpus [files]
(flatten (for [file files] (into [] cat
(for [file files]
(-> (slurp file) (-> (slurp file)
(str/replace #"[^\sa-zA-ZäöüÄÖÜß',.:!?]" "") (str/replace #"[^\sa-zA-ZäöüÄÖÜß',.:!?]" "")
(str/split #"\s+"))))) (str/split #"\s+")))))
@ -21,17 +22,28 @@
(and (and
(re-find #"[.?!]$" last-word) (re-find #"[.?!]$" last-word)
(or (or
(not (#{"Mr." "Mrs."} last-word)) (not (#{"Mr." "Mrs." "K."} last-word))
;; if it ends with a title, the title needs an article before that ;; if it ends with a title, the title needs an article before that
(and (#{"Mr." "Mrs."} last-word) (and (#{"Mr." "Mrs."} last-word)
(not (#{"the" "a"} second-to-last))))))) (not (#{"the" "a"} second-to-last)))))))
(defn sentence-start? [state] (def sentence-starts
(some? (re-find #"^[A-Z]" (first state)))) (memoize
(fn [corpus]
(->>
(map vector corpus (rest corpus))
(filter (fn [[a b]]
(and (re-find #"[.?!]$" a)
(re-find #"^[A-ZÄÖÜ]" b))))
(map second)
(remove #{"K." "Sie" "Ihnen" "Herr"})
(into #{})))))
(defn generate-sentence [markov-chain] (defn generate-sentence [corpus markov-chain]
(let [start (->> (markov/states markov-chain) (let [starts (sentence-starts corpus)
(filter sentence-start?) start (->>
(markov/states markov-chain)
(filter #(starts (first %)))
(rand-nth))] (rand-nth))]
(str/join " " (markov/generate markov-chain start sentence-ending?)))) (str/join " " (markov/generate markov-chain start sentence-ending?))))
@ -105,23 +117,25 @@
(defn examples (defn examples
"Will return a preview of the text that will be generated." "Will return a preview of the text that will be generated."
[corpus order] [corpus order]
(let [markov-chain (->> (read-corpus corpus) (let [corpus (read-corpus corpus)
(markov/chain order))] markov-chain (markov/chain order corpus)]
(repeatedly 10 #(generate-sentence markov-chain)))) markov-chain
#_(repeatedly 10 #(generate-sentence corpus markov-chain))))
(defn start! [{:keys [corpus order interval]}] (defn start! [{:keys [corpus order interval]}]
(let [markov-chain (->> (read-corpus corpus) (let [corpus (read-corpus corpus)
(markov/chain order))] markov-chain (markov/chain order corpus)]
(log/warn "David Foster Wallace once claimed that Kafka sat in his room at night, writing his stories and driving all of his neighbors into insanity because he could not stop laughing manically. I don't know if that is true, but it is an interesting story to tell, right?") (log/warn "David Foster Wallace once claimed that Kafka sat in his room at night, writing his stories and driving all of his neighbors into insanity because he could not stop laughing manically. I don't know if that is true, but it is an interesting story to tell, right?")
(when-not (environment-setup?) (when-not (environment-setup?)
(log/warn "Please set :access-token and :mastodon-instance in the .env file.") (log/warn "Please set the ACCESS_TOKEN and MASTODON_INSTANCE enviornment variables.")
(log/warn "The bot is running in debug mode for now.")) (log/warn "The bot is running in debug mode for now."))
(loop [sentence (generate-sentence markov-chain)] (loop [sentence (generate-sentence corpus markov-chain)]
(when (< (count-words sentence) 20) (when (< (count-words sentence) 20)
;; shorter sentences are more likely to be coherent. :) ;; shorter sentences are more likely to be coherent. :)
(send-toot! sentence) (send-toot! sentence)
(Thread/sleep (* interval 1000))) (Thread/sleep (* interval 1000)))
(recur (generate-sentence markov-chain))))) (recur (generate-sentence corpus markov-chain)))))
(defn -main [& args] (defn -main [& args]
(let [parsed-args (validate-args args)] (let [parsed-args (validate-args args)]