mirror of
https://gitdab.com/cadence/breezewiki.git
synced 2026-03-05 13:40:27 -05:00
Add more indexers and Solr configuration
This commit is contained in:
179
archiver/fts.rkt
179
archiver/fts.rkt
@@ -1,4 +1,4 @@
|
||||
#lang racket/base
|
||||
#lang racket
|
||||
(require racket/function
|
||||
racket/future
|
||||
racket/match
|
||||
@@ -9,12 +9,35 @@
|
||||
file/gunzip
|
||||
db
|
||||
db/unsafe/sqlite3
|
||||
net/http-easy
|
||||
json
|
||||
json-pointer
|
||||
"../lib/html-parsing/main.rkt"
|
||||
"../lib/xexpr-utils.rkt"
|
||||
"../lib/tree-updater.rkt")
|
||||
|
||||
(define-syntax (seq stx)
|
||||
(syntax-case stx ()
|
||||
[(_ body ...)
|
||||
#`(for ([op (list (lambda () body) ...)]
|
||||
[i (in-naturals)])
|
||||
(define res (op))
|
||||
(when (>= (response-status-code res) 400)
|
||||
(error 'seq "op #~a: status code was ~a: ~v" i (response-status-code res) (response-json res)))
|
||||
(define taskuid (json-pointer-value "/taskUid" (response-json res)))
|
||||
(for/or ([ticks (in-naturals)]
|
||||
[res2 (in-producer (lambda () (get (format "http://localhost:7700/tasks/~a" taskuid))))])
|
||||
(define status (json-pointer-value "/status" (response-json res2)))
|
||||
(case status
|
||||
[("enqueued" "processing")
|
||||
(sleep 1)
|
||||
#f]
|
||||
[("succeeded")
|
||||
(printf "op #~a: ~a (~a ticks)~n" i status ticks)
|
||||
#t]
|
||||
[else
|
||||
(error 'seq "op #~a: task status was ~a: ~v" i status res2)])))]))
|
||||
|
||||
(define (class-has? attributes substrs)
|
||||
(define cl (or (get-attribute 'class attributes) ""))
|
||||
(ormap (λ (substr) (string-contains? cl substr)) substrs))
|
||||
@@ -29,17 +52,27 @@
|
||||
(define slc (sqlite3-connect #:database "../storage/fts-separate.db"))
|
||||
(sqlite3-load-extension slc "fts5")
|
||||
|
||||
(define (writer page)
|
||||
(for ([bit page])
|
||||
(cond
|
||||
[(memq bit '(div p li td)) (displayln "")]
|
||||
[(symbol? bit) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '@)) (void)]
|
||||
[(pair? bit) (writer bit)]
|
||||
[(string? bit) (display bit)])))
|
||||
(define (writer tables-mode? page)
|
||||
(define (writer-inner page)
|
||||
(for ([bit page])
|
||||
(cond
|
||||
[(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)]
|
||||
[(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)]
|
||||
[(memq bit '(div p li td dd dt br)) (displayln "")]
|
||||
[(symbol? bit) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '@)) (void)]
|
||||
[(pair? bit) (writer-inner bit)]
|
||||
[(string? bit) (display bit)])))
|
||||
(writer-inner page))
|
||||
|
||||
(define wikiname "sto")
|
||||
(define (write-and-post-process tables-mode? page)
|
||||
(define text (with-output-to-string (λ () (writer tables-mode? page))))
|
||||
;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text ""))
|
||||
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
|
||||
shrink-text)
|
||||
|
||||
(define wikiname "bloons")
|
||||
(define tablename (format "page_~a" wikiname))
|
||||
|
||||
(define ((extract f)) ; f - filename
|
||||
@@ -55,35 +88,115 @@
|
||||
(read-json in)]
|
||||
[else #f]))
|
||||
(define title (json-pointer-value "/parse/title" j))
|
||||
(define pageid (json-pointer-value "/parse/pageid" j))
|
||||
(define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
|
||||
(define page (update-tree updater (html->xexp page-html)))
|
||||
(define text (with-output-to-string (λ () (writer page))))
|
||||
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
|
||||
(values title shrink-text)))
|
||||
(define body (write-and-post-process #f page))
|
||||
(define table (write-and-post-process #t page))
|
||||
(values title body table pageid)))
|
||||
|
||||
(println "extracting text...")
|
||||
(define results
|
||||
(time
|
||||
(for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)]
|
||||
#:when (member (path-get-extension f) '(#".json" #".gz")))
|
||||
(extract f))))
|
||||
(for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)]
|
||||
#:when (member (path-get-extension f) '(#".json" #".gz")))
|
||||
(extract f)))
|
||||
|
||||
;; ***************************************************************************************************
|
||||
;; TESTING WRITER
|
||||
;; ***************************************************************************************************
|
||||
#;(for/first ([fut results]
|
||||
[i (in-naturals 1)]
|
||||
#:when (i . >= . 4859))
|
||||
(define-values (title body table pageid) (fut))
|
||||
(println title)
|
||||
(println body)
|
||||
(println table))
|
||||
|
||||
(println "inserting...")
|
||||
(query-exec slc "begin transaction")
|
||||
#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname))
|
||||
(time
|
||||
(for ([fut results]
|
||||
[i (in-naturals 1)])
|
||||
(display "-")
|
||||
(when (and (> i 0) (= (modulo i 100) 0))
|
||||
(println i))
|
||||
(define-values (title shrink-text) (fut))
|
||||
(query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text)))
|
||||
|
||||
(println "running optimize...")
|
||||
(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename))
|
||||
;; ***************************************************************************************************
|
||||
;; SQLite FTS5
|
||||
;; ***************************************************************************************************
|
||||
#;(begin
|
||||
(query-exec slc "begin transaction")
|
||||
#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname))
|
||||
(time
|
||||
(for ([fut results]
|
||||
[i (in-naturals 1)])
|
||||
(display "-")
|
||||
(when (and (> i 0) (= (modulo i 100) 0))
|
||||
(println i))
|
||||
(define-values (title shrink-text) (fut))
|
||||
(query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text)))
|
||||
|
||||
(println "committing...")
|
||||
(query-exec slc "commit")
|
||||
(println "running optimize...")
|
||||
(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename))
|
||||
|
||||
(println "committing...")
|
||||
(query-exec slc "commit"))
|
||||
|
||||
;; ***************************************************************************************************
|
||||
;; Solr
|
||||
;; ***************************************************************************************************
|
||||
(begin
|
||||
(define data
|
||||
(cond
|
||||
#;[(file-exists? "cache.rkt")
|
||||
(println "reading in...")
|
||||
(with-input-from-file "cache.rkt" (λ () (read)))]
|
||||
[else
|
||||
(define data
|
||||
(for/list ([fut results]
|
||||
[i (in-naturals 1)])
|
||||
(display "-")
|
||||
(when (and (> i 0) (= (modulo i 100) 0))
|
||||
(println i))
|
||||
(define-values (title body table pageid) (fut))
|
||||
(define len (string-length body))
|
||||
`#hasheq((id . ,(number->string pageid))
|
||||
(title . ,title)
|
||||
(body . ,body)
|
||||
(table . ,table)
|
||||
(len . ,len))))
|
||||
|
||||
(println "writing out...")
|
||||
(with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace)
|
||||
data]))
|
||||
|
||||
(println "posting...")
|
||||
(define res
|
||||
(post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname)
|
||||
#:json data)))
|
||||
|
||||
;; ***************************************************************************************************
|
||||
;; Meilisearch
|
||||
;; ***************************************************************************************************
|
||||
#;(begin
|
||||
(seq
|
||||
(put (format "http://localhost:7700/indexes/~a/settings/searchable-attributes" wikiname)
|
||||
#:json '("title" "body"))
|
||||
(put (format "http://localhost:7700/indexes/~a/settings/ranking-rules" wikiname)
|
||||
#:json '("words" "typo" #;"proximity" "attribute" "sort" "exactness" #;"len:desc"))
|
||||
(call-with-input-file "stop-words.json"
|
||||
(λ (in)
|
||||
(put (format "http://localhost:7700/indexes/~a/settings/stop-words" wikiname)
|
||||
#:headers '#hasheq((Content-Type . "application/json"))
|
||||
#:data in))))
|
||||
(define data
|
||||
(for/list ([fut results]
|
||||
[i (in-naturals 1)])
|
||||
(display "-")
|
||||
(when (and (> i 0) (= (modulo i 100) 0))
|
||||
(println i))
|
||||
(define-values (title body pageid) (fut))
|
||||
(define len (string-length body))
|
||||
`#hasheq((id . ,pageid)
|
||||
(title . ,title)
|
||||
(body . ,body)
|
||||
(len . ,len))))
|
||||
(define res
|
||||
(post (format "http://localhost:7700/indexes/~a/documents" wikiname)
|
||||
#:json data))
|
||||
(seq res)
|
||||
(println (response-json res)))
|
||||
|
||||
(disconnect slc)
|
||||
|
||||
Reference in New Issue
Block a user