2020-04-10 20:31:41 +00:00
|
|
|
#!/usr/bin/env racket
|
2020-04-10 23:12:50 +00:00
|
|
|
#lang racket
|
|
|
|
; vim: ft=racket
|
2020-04-10 20:36:38 +00:00
|
|
|
(require smol-http)
|
|
|
|
|
2020-04-10 23:23:14 +00:00
|
|
|
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
|
|
|
|
;; christo pls do not track thanks
|
|
|
|
(dnt . "1")))
|
|
|
|
|
2020-04-11 00:13:04 +00:00
|
|
|
(struct cj [jar cache sema] #:transparent)
|
|
|
|
(define (make-cookie-jar)
|
|
|
|
(cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
|
|
|
|
(define (update-cookie-jar! jar hdrs)
|
|
|
|
(call-with-semaphore
|
|
|
|
(cj-sema jar)
|
|
|
|
(lambda ()
|
|
|
|
(for ([hdr (in-list hdrs)])
|
|
|
|
(when (symbol=? 'set-cookie (car hdr))
|
|
|
|
(define kv (string-trim (first (string-split (cdr hdr) ";"))))
|
|
|
|
(match-define (list k v) (string-split kv "="))
|
|
|
|
(hash-set! (cj-jar jar) k v)))
|
|
|
|
(set-box!
|
|
|
|
(cj-cache jar)
|
|
|
|
(cons 'cookie
|
|
|
|
(string-join
|
|
|
|
(for/list ([(k v) (in-hash jar)])
|
|
|
|
(format "~a=~a" k v)) "; "))))))
|
|
|
|
(define (cookie-jar->header jar)
|
|
|
|
(call-with-semaphore (cj-sema jar) (curry unbox (cj-cache jar))))
|
|
|
|
|
|
|
|
(define debug-mode? (make-parameter #f))
|
|
|
|
(define current-cookie-jar (make-parameter (make-cookie-jar)))
|
|
|
|
|
|
|
|
(define (printf-debug . args)
|
|
|
|
(when (debug-mode?) (apply printf args)))
|
|
|
|
|
|
|
|
(define (print-progress total-pages in-flight crawled-pages num-flags)
|
|
|
|
(when (debug-mode?)
|
|
|
|
(printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a"
|
|
|
|
crawled-pages total-pages in-flight num-flags)
|
|
|
|
(flush-output (current-output-port))))
|
|
|
|
|
|
|
|
(define (print-complete)
|
|
|
|
(when (debug-mode?)
|
|
|
|
(printf "\r\x1b[KCrawl complete\n")))
|
|
|
|
|
2020-04-10 20:36:38 +00:00
|
|
|
(define req
|
2020-04-10 23:23:14 +00:00
|
|
|
(make-http-req 'GET "/hello-world" DEFAULT-HDRS))
|
2020-04-10 20:31:41 +00:00
|
|
|
|
2020-04-10 23:12:50 +00:00
|
|
|
(define (run-webcrawler username password)
|
2020-04-11 00:13:04 +00:00
|
|
|
(printf-debug "the credentials are: ~s ~s\n" username password)
|
2020-04-10 20:36:38 +00:00
|
|
|
(write-http-msg req))
|
2020-04-10 23:12:50 +00:00
|
|
|
|
|
|
|
(module+ main
|
|
|
|
(command-line
|
|
|
|
#:program "webcrawler"
|
2020-04-11 00:13:04 +00:00
|
|
|
#:once-each
|
|
|
|
[("-d") "Debug mode" (debug-mode? #t)]
|
2020-04-10 23:12:50 +00:00
|
|
|
#:args
|
|
|
|
(username password)
|
|
|
|
(run-webcrawler username password)))
|