(module transform () (import scheme) (import (chicken base)) (import (chicken file)) (import (chicken irregex)) (import (chicken process-context)) (import (chicken string)) (import (srfi 1)) (import (srfi 69)) (import medea) (define stats (make-hash-table)) ;;; string utils (define (string-trim string) (irregex-replace/all '(or (: bos (+ space)) (: (+ space) eos)) string "")) (define (string-strip-quotes string) (irregex-replace/all "'" string "")) (define (string-downcase string) (list->string (map char-downcase (string->list string)))) ;;; parsing (define (parse-wiki-json path) (let* ((json (call-with-input-file path read-json)) (pages (map cdr (alist-ref 'pages (alist-ref 'query json))))) (filter-map parse-page-json pages))) (define (parse-page-json json) (let* ((page-id (alist-ref 'pageid json)) (title (alist-ref 'title json)) (revision (vector-ref (alist-ref 'revisions json) 0)) (text (alist-ref '* (alist-ref 'main (alist-ref 'slots revision)))) (attrs (extract-mycomorphbox text))) (if (and (not (irregex-search "^User:" title)) attrs) (cons (string->symbol (number->string page-id)) (cons `(title . ,title) attrs)) #f))) (define (extract-mycomorphbox string) (and-let* ((irx (irregex "{{[^}]*(mycomorphbox[^}]+)}}" 'case-insensitive)) (match (irregex-search irx string)) (submatch (irregex-match-substring match 1))) (parse-mycomorphbox submatch))) (define (normalize-string string) (string-downcase (string-strip-quotes (string-trim string)))) (define (bogus-attr? attr) (let ((key (car attr)) (value (cadr attr))) (or (equal? value "") ;; fairy-like trips my ass (and (equal? key "howedible") (substring-index "fairy" value))))) (define (remap-attr attr) (let ((key (car attr)) (value (cadr attr))) (cond ((and (equal? key "whichgills") (equal? value "adnex")) '("whichgills" "adnexed")) ((and (equal? key "sporeprintcolor") (equal? value "white to yellow")) '("sporeprintcolor" "yellow")) (else attr)))) (define (normalize-numbered-attrs attrs) (define (strip-number s) (irregex-replace "[0-9]+$" s "")) (define (make-cons value) (lambda (values) (cons value values))) (define (hash-table->json-alist dedup) (let ((object (make-hash-table))) (hash-table-walk dedup (lambda (key value) (hash-table-set! object (string->symbol key) (list->vector value)))) (hash-table->alist object))) (define (unique list) (let ((dedup (make-hash-table))) (for-each (lambda (item) (hash-table-set! dedup item #t)) list) (hash-table-keys dedup))) (let ((dedup (make-hash-table))) (for-each (lambda (attr) (let* ((key (strip-number (car attr))) (value (cadr attr))) (hash-table-update!/default stats key (make-cons value) '()) (hash-table-update!/default dedup key (make-cons value) '()))) attrs) (hash-table-walk stats (lambda (key value) (hash-table-set! stats key (unique value)))) (hash-table->json-alist dedup))) (define (parse-mycomorphbox string) (let* ((lines (cdr (string-split string "|"))) (lines (filter (lambda (line) (substring-index "=" line)) lines)) (attrs (map (lambda (line) (string-split line "=" #t)) lines)) (attrs (map (lambda (attr) (map normalize-string attr)) attrs)) (attrs (map remap-attr attrs)) (attrs (remove bogus-attr? attrs)) (attrs (normalize-numbered-attrs attrs))) attrs)) (define (main dir json-path stats-path) (import (chicken pretty-print)) (call-with-output-file json-path (lambda (out) (write-json (find-files dir test: ".*\\.json" action: (lambda (path json) (append (parse-wiki-json path) json))) out))) (call-with-output-file stats-path (lambda (out) (pp (hash-table->alist stats) out)))) (apply main (command-line-arguments)) )