Improve text generation

This commit is contained in:
arne 2024-01-06 10:50:31 +01:00
commit a3e2ef331f
2 changed files with 39 additions and 28 deletions

View file

@ -6,24 +6,21 @@ František Kafka is a bot that generates text using Markov chains. It is online
## Installation
You need your input text in plain text; [Project Gutenberg](https://www.gutenberg.org) is a good source for that. Make sure to remove all parts of the text that are not to be used as the bot's corpus.<sup>1</sup>
You will also need an access token, so register your app at a Mastodon instance of your choice to get one. Copy `.env.example` to `.env` and edit it to set up the environment correctly.
Building and running the Clojure code requires [leiningen](https://leiningen.org/) version 2.8 or above.
You need your input text in plain text; [Project Gutenberg](https://www.gutenberg.org) is a good source for that. Make sure to remove all parts of the text that are not to be used as the bot's corpus.
You will also need an access token, so register your app at a Mastodon instance of your choice to get one.
1: This means removing the copyright notice, which is the reason why there is no text included in this repository.
Building and running the Clojure code requires [babashka](https://babashka.org/); version 1.3 is tested and should work.
## Usage
```
# build it and run it
$ lein uberjar
$ java -jar target/uberjar/heyarne.frantisek-kafka-0.1.0-standalone.jar [args]
$ bb -m heyarne.frantisek-kafka.samsa --interval=5 --order=3 resources/kafka-en.txt
```
## Options
```
$ lein run -- --help
$ bb -m heyarne.frantisek-kafka.samsa --help
František Kafka is a Markov chain bot that infinitely generates text
from one or more text files. botsin.space/@frantisek hosts a live
version of it that generates an infinite version of Franz Kafka's Metamorphosis.

View file

@ -10,10 +10,11 @@
;; parsing the input text
;; Reads every path in `files` with `slurp`, deletes each character that is
;; not whitespace, an ASCII letter, a German umlaut/eszett, or one of
;; ' , . : ! ? and then splits the remaining text on runs of whitespace.
;; The per-file word sequences are concatenated into one collection.
;; NOTE(review): two bodies appear under the single `defn` header below -- a
;; `flatten` form (lazy seq result) followed by an `into [] cat` form (vector
;; result). This looks like the removed and added sides of a diff rendered
;; together; confirm which one is the current implementation.
(defn read-corpus [files]
(flatten (for [file files]
(-> (slurp file)
(str/replace #"[^\sa-zA-ZäöüÄÖÜß',.:!?]" "")
(str/split #"\s+")))))
(into [] cat
(for [file files]
(-> (slurp file)
(str/replace #"[^\sa-zA-ZäöüÄÖÜß',.:!?]" "")
(str/split #"\s+")))))
(defn sentence-ending? [words]
(let [last-word (last words)
@ -21,18 +22,29 @@
(and
(re-find #"[.?!]$" last-word)
(or
(not (#{"Mr." "Mrs."} last-word))
(not (#{"Mr." "Mrs." "K."} last-word))
;; if it ends with a title, the title needs an article before that
(and (#{"Mr." "Mrs."} last-word)
(not (#{"the" "a"} second-to-last)))))))
(defn sentence-start?
  "Returns true when the first element of `state` begins with an uppercase
  ASCII letter (A-Z), false otherwise."
  [state]
  (let [leading-word (first state)]
    (not (nil? (re-find #"^[A-Z]" leading-word)))))
(def sentence-starts
  "Memoized function of one argument, `corpus` (a sequence of word strings).
  Returns the set of words that directly follow a word ending in `.`, `?` or
  `!` and that themselves begin with an uppercase letter (A-Z, Ä, Ö, Ü),
  with a fixed list of words removed from the result."
  (memoize
   (fn [corpus]
     (let [excluded #{"K." "Sie" "Ihnen" "Herr"}
           ;; adjacent word pairs: (w0 w1) (w1 w2) ...
           neighbours (partition 2 1 corpus)]
       (into #{}
             (comp
              (filter (fn [[previous-word next-word]]
                        (and (re-find #"[.?!]$" previous-word)
                             (re-find #"^[A-ZÄÖÜ]" next-word))))
              (map second)
              (remove excluded))
             neighbours)))))
;; Chooses a random starting state from `(markov/states markov-chain)` and
;; returns the words produced by `markov/generate` joined with single spaces.
;; `sentence-ending?` is handed to `markov/generate` -- presumably as the stop
;; predicate; confirm against the markov namespace.
;; NOTE(review): two headers and two `let` openings are interleaved below.
;; The one-argument form filters states with `sentence-start?`; the
;; two-argument form filters on whether the first word of a state is in
;; `(sentence-starts corpus)`. This looks like both sides of a diff shown
;; together; confirm which signature callers should use.
(defn generate-sentence [markov-chain]
(let [start (->> (markov/states markov-chain)
(filter sentence-start?)
(rand-nth))]
(defn generate-sentence [corpus markov-chain]
(let [starts (sentence-starts corpus)
start (->>
(markov/states markov-chain)
;; `starts` is a set, so calling it tests membership of the state's first word
(filter #(starts (first %)))
(rand-nth))]
(str/join " " (markov/generate markov-chain start sentence-ending?))))
;; interacting with the API
@ -105,23 +117,25 @@
;; Builds a Markov chain of the given `order` from the files named in `corpus`
;; (read via `read-corpus`).
;; NOTE(review): two `let` forms are interleaved below. The first returns ten
;; generated sentences via `repeatedly`; the second returns `markov-chain`
;; itself, because the `repeatedly` form is discarded by the `#_` reader macro.
;; Returning the chain does not match the docstring -- possibly a debugging
;; leftover; confirm the intended return value.
(defn examples
"Will return a preview of the text that will be generated."
[corpus order]
(let [markov-chain (->> (read-corpus corpus)
(markov/chain order))]
(repeatedly 10 #(generate-sentence markov-chain))))
(let [corpus (read-corpus corpus)
markov-chain (markov/chain order corpus)]
markov-chain
#_(repeatedly 10 #(generate-sentence corpus markov-chain))))
;; Entry point for the bot loop. Takes a map with :corpus (file paths passed to
;; `read-corpus`), :order (passed to `markov/chain`) and :interval (seconds
;; between toots), builds the chain once, logs warnings when
;; `environment-setup?` is false, then generates sentences in an endless loop.
;; NOTE(review): two `let` openings, two warning lines and two loop/recur forms
;; are interleaved below (one-argument and two-argument calls to
;; `generate-sentence`). This looks like both sides of a diff shown together;
;; confirm which variant is current.
(defn start! [{:keys [corpus order interval]}]
(let [markov-chain (->> (read-corpus corpus)
(markov/chain order))]
(let [corpus (read-corpus corpus)
markov-chain (markov/chain order corpus)]
(log/warn "David Foster Wallace once claimed that Kafka sat in his room at night, writing his stories and driving all of his neighbors into insanity because he could not stop laughing manically. I don't know if that is true, but it is an interesting story to tell, right?")
(when-not (environment-setup?)
(log/warn "Please set :access-token and :mastodon-instance in the .env file.")
;; NOTE(review): "enviornment" in the message below is misspelled; left
;; untouched here because it is runtime text.
(log/warn "Please set the ACCESS_TOKEN and MASTODON_INSTANCE enviornment variables.")
(log/warn "The bot is running in debug mode for now."))
(loop [sentence (generate-sentence markov-chain)]
(loop [sentence (generate-sentence corpus markov-chain)]
;; Only sentences under 20 words are sent; longer ones are dropped and the
;; loop immediately generates another without sleeping.
(when (< (count-words sentence) 20)
;; shorter sentences are more likely to be coherent. :)
(send-toot! sentence)
;; `interval` is in seconds; Thread/sleep takes milliseconds
(Thread/sleep (* interval 1000)))
(recur (generate-sentence markov-chain)))))
(recur (generate-sentence corpus markov-chain)))))
(defn -main [& args]
(let [parsed-args (validate-args args)]