Add comments (and now it's sleepy time :iitalics: :blobcatsleepreach:)

xenia 2020-04-11 04:57:08 -04:00
parent 1c9c029768
commit 4475e50cc5
2 changed files with 33 additions and 0 deletions

@@ -46,6 +46,8 @@
;; -> CookieJar
;; Creates empty cookie jar
;; Since Racket threads are preemptive (gross), we include a semaphore to guard
;; against concurrent modification
(define (make-cookie-jar)
  (cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
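
For reference, a minimal sketch of how the semaphore could guard a concurrent modification, assuming hypothetical accessors cj-sem and cj-table (only the cj constructor call appears in this hunk):

;; Hedged sketch, not the commit's code: guard a mutation of the jar's hash
;; with its semaphore. call-with-semaphore takes the lock, runs the thunk,
;; and releases the lock even if the thunk raises.
(define (cookie-jar-set! jar key val)
  (call-with-semaphore
   (cj-sem jar)                                 ; hypothetical accessor
   (lambda () (hash-set! (cj-table jar) key val)))) ; hypothetical accessor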
@@ -94,6 +96,7 @@
(match xexpr
  [(list _ (? (curry member? '(class "secret_flag"))) str)
   (match (string-split (string-trim str) " ")
     ;; Match the exact flag format
     [(list "FLAG:" (? (compose (curry = 64) string-length) flag))
      (list flag)]
     [_ '()])]
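
As a quick sanity check of that clause, here is a hypothetical input run through the same pattern; the sample string is fabricated just to satisfy the 64-character length test:

;; A 64-character value after "FLAG:" matches and is returned in a list;
;; anything else falls through to '()
(define sample (string-append "FLAG: " (make-string 64 #\a)))
(match (string-split (string-trim sample) " ")
  [(list "FLAG:" (? (compose (curry = 64) string-length) flag)) (list flag)]
  [_ '()])
;; => a one-element list containing the 64 a's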

@@ -25,15 +25,20 @@
;; Returns response for request without closing the socket
(define (crawler-fetch/noclose method path [body #f])
  ;; Create the request based on the given parameters
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  ;; Do the thing, with the thing
  ;; Include cookie jar cookies
  (define sock
    (http-connect
     HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                          DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  ;; If we got any new cookies (yum!!) add them to the jar
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  ;; Abstract over some response codes we can handle directly here
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
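
The redirect clause is cut off by the hunk; a plausible sketch of what "transparently" could mean here, with http-msg-header as a hypothetical accessor for the Location header (the other names all appear above):

;; Hedged sketch: close the current socket and chase the Location target
;; with the same method and body
[(or 301 302)
 (define loc (http-msg-header rsp 'location)) ; hypothetical accessor
 (http-close sock)
 (crawler-fetch/noclose method loc body)]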
@@ -57,10 +62,14 @@
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    ;; return an empty response for 403 and 404. the page won't be visited again
    ;; because it will have been added to the complete set
    [(or 403 404)
     (http-close sock)
     '()]
    ;; we shouldn't run into this one
    [400 (error "you screwed up, got error 400 :angery:" params)]
    ;; normal response yay!!
    [200
     (define xe
       (string->xexpr
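
The hunk cuts off inside the 200 clause, but the shape of the wrapper is already clear; a hypothetical call (argument values assumed, not from the commit):

;; '() on 403/404, an error on 400, a parsed xexpr on 200
(define xe (crawler-fetch/xexpr "GET" "/some/path"))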
@@ -101,6 +110,7 @@
         [flags (find-flags xe)]
         [page-links (map (lambda (x) (combine-url/relative page-url x))
                          (find-hrefs xe))])
    ;; only return URLs that are OK to crawl
    (values (filter crawler-valid-url? page-links) flags)))
;; ---------------------------------------------------------------------------------------
@@ -108,32 +118,47 @@
(define NUM-TASKS 256)

;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
(define (crawler-worker-thread worker-id jobserver)
  (let loop ()
    (define next-job (thread-receive))
    ;; jobserver sends a #f when it's time to exit
    (when next-job
      (define-values (new-urls flags) (crawler-iterate-one next-job))
      (thread-send jobserver (list worker-id next-job new-urls flags))
      (loop))))
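
The #f sentinel implies a matching shutdown step on the jobserver side; a minimal sketch under that assumption (stop-workers! is a hypothetical helper, not shown in the diff):

;; Sending #f makes the (when next-job ...) guard fail, ending each worker's
;; loop; thread-wait then reaps the thread
(define (stop-workers! worker-threads)
  (for ([w (in-vector worker-threads)]) (thread-send w #f))
  (for ([w (in-vector worker-threads)]) (thread-wait w)))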
;; Starts up all the threads and schedules jobs until crawling is complete
(define (crawler-jobserver root-url)
  ;; Inner function to be run in its own thread
  (define (jobserver-thread)
    ;; Create workers
    (define worker-threads
      (let ([parent (current-thread)])
        (for/vector ([i (in-range NUM-TASKS)])
          (thread (lambda () (crawler-worker-thread i parent))))))
    ;; Create worker status - #t means the worker is ready for the next task,
    ;; #f means the worker is currently busy
    (define worker-ready (make-vector NUM-TASKS #t))
    ;; URLs we have yet to crawl
    (define pending (mutable-set root-url))
    ;; URLs we have crawled already
    (define completed (mutable-set))
    ;; All flags seen so far
    (define all-flags (mutable-set))
    ;; Main loop
    (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
      ;; Loop until no workers are busy and no URLs are pending; then we are done
      (unless (and (not any-busy) (set-empty? pending))
        ;; Find any available worker to give the next job to
        (define available-worker
          (for/first ([w (in-naturals)]
                      [r (in-vector worker-ready)]
                      #:when r)
            w))
        ;; If there is a worker and a job, assign the job
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)
@@ -141,9 +166,12 @@
          (vector-set! worker-ready available-worker #f)
          (set! any-busy #t))
        ;; If there are no more jobs right now or no available workers, wait for
        ;; one of the workers to complete its job
        (when (or (set-empty? pending) (false? available-worker))
          (match-define (list worker-id job-url new-urls flags) (thread-receive))
          (vector-set! worker-ready worker-id #t)
          ;; Update our state with the results
          (set-add! completed job-url)
          (for ([u (in-list new-urls)])
            (unless (set-member? completed u)
@@ -152,6 +180,7 @@
              (print-flag flag)
              (set-add! all-flags flag)))
        ;; Print a useful status line when in debug mode
        (print-progress (+ (set-count completed) (set-count pending))
                        (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                        (set-count completed) (set-count all-flags))
@@ -176,6 +205,7 @@
  (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (crawler-jobserver base-url))
;; Parse command line arguments and run
(module+ main
  (command-line
   #:program "webcrawler"
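
The diff ends mid-form; purely as a hedged guess at the usual shape of such a command-line block (the #:args names and run-webcrawler are hypothetical, not from the commit):

(module+ main
  (command-line
   #:program "webcrawler"
   ;; hypothetical continuation: hand positional args to the entry point
   #:args (username password)
   (run-webcrawler username password)))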