Add comments (and now it's sleepy time :iitalics: :blobcatsleepreach:)

xenia 2020-04-11 04:57:08 -04:00
parent 1c9c029768
commit 4475e50cc5
2 changed files with 33 additions and 0 deletions

View File

@@ -46,6 +46,8 @@
;; -> CookieJar
;; Creates empty cookie jar
;; Since Racket threads are preemptive (gross) we include a semaphore to guard against concurrent
;; modification
(define (make-cookie-jar)
(cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
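A minimal sketch of how that semaphore can serialize writers, assuming the cj struct's third field is the lock (the accessor names below are hypothetical, not from this diff):

(define (call-with-cookie-jar-lock jar thunk)
  ;; only one thread at a time runs thunk, so concurrent hash mutations can't interleave
  (call-with-semaphore (cj-sem jar) thunk))

;; e.g. (call-with-cookie-jar-lock jar (lambda () (hash-set! (cj-hash jar) "session" "abc123")))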
@@ -94,6 +96,7 @@
(match xexpr
[(list _ (? (curry member? '(class "secret_flag"))) str)
(match (string-split (string-trim str) " ")
;; Match the exact flag format
[(list "FLAG:" (? (compose (curry = 64) string-length) flag))
(list flag)]
[_ '()])]
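As a quick illustration of what that pattern accepts (the input below is fabricated), the trimmed text has to split into exactly "FLAG:" followed by one 64-character token:

(define example (string-append "FLAG: " (make-string 64 #\a)))  ; made-up flag line
(match (string-split (string-trim example) " ")
  [(list "FLAG:" (? (compose (curry = 64) string-length) flag)) (list flag)]
  [_ '()])
;; => a one-element list holding the 64-character token; any other shape yields '()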

View File

@@ -25,15 +25,20 @@
;; Returns response for request without closing the socket
(define (crawler-fetch/noclose method path [body #f])
;; Create the request based on the given parameters
(define req
(let ([basic-req (make-http-req method path '())])
(if body (http-set-body basic-req body) basic-req)))
;; Do the thing, with the thing
;; Include cookie jar cookies
(define sock
(http-connect
HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
DEFAULT-HDRS)))
(define rsp (http-request sock req))
;; If we got any new cookies (yum!!) add them to the jar
(update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
;; Abstract over some response codes we can handle directly here
(match (http-rsp-code rsp)
[(or 301 302)
;; handle redirects transparently
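Given the (box (cons 'cookie "")) created in make-cookie-jar above, one plausible reading is that cookie-jar->header simply hands back that cached pair; a hedged sketch (the accessor name is hypothetical, and the real function may rebuild the string from the hash instead):

(define (cookie-jar->header/sketch jar)
  ;; assumes the box caches a ready-to-send header pair like '(cookie . "k1=v1; k2=v2")
  (unbox (cj-header-box jar)))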
@@ -57,10 +62,14 @@
(define (crawler-fetch/xexpr . params)
(define-values [rsp sock] (apply crawler-fetch/noclose params))
(match (http-rsp-code rsp)
;; return an empty response for 403 and 404. the page won't be visited again
;; because it will have been added to the complete set
[(or 403 404)
(http-close sock)
'()]
;; we shouldn't run into this one
[400 (error "you screwed up, got error 400 :angery:" params)]
;; normal response yay!!
[200
(define xe
(string->xexpr
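A hedged usage sketch of the dispatch above (the method symbol and path are guesses, not taken from this diff): a 403 or 404 simply comes back as '(), so callers can treat missing pages as contributing nothing:

(match (crawler-fetch/xexpr 'GET "/some/page")
  ['() (printf "gone or forbidden, nothing to parse\n")]
  [xe  (printf "got a parsed page\n")])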
@@ -101,6 +110,7 @@
[flags (find-flags xe)]
[page-links (map (lambda (x) (combine-url/relative page-url x))
(find-hrefs xe))])
;; only return URLs that are OK to crawl
(values (filter crawler-valid-url? page-links) flags)))
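The rules crawler-valid-url? applies are not shown in this hunk; a hedged sketch of the kind of filter a crawler like this typically wants (the predicate below is illustrative only):

(define (valid-crawl-url?/sketch u)
  ;; stay on the target host and only follow http(s) links
  (and (member (url-scheme u) '("http" "https"))
       (equal? (url-host u) HOST)))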
;; ---------------------------------------------------------------------------------------
@@ -108,32 +118,47 @@
(define NUM-TASKS 256)
;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
(define (crawler-worker-thread worker-id jobserver)
(let loop ()
(define next-job (thread-receive))
;; jobserver sends a #f when it's time to exit
(when next-job
(define-values (new-urls flags) (crawler-iterate-one next-job))
(thread-send jobserver (list worker-id next-job new-urls flags))
(loop))))
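The exit protocol above (a #f job means "shut down") implies a matching step on the jobserver side; a hedged sketch of what that shutdown could look like (it is not part of this diff):

(define (stop-workers/sketch worker-threads)
  (for ([w (in-vector worker-threads)])
    (thread-send w #f))   ; tell every worker to fall out of its loop
  (for ([w (in-vector worker-threads)])
    (thread-wait w)))     ; then wait until each one has actually exited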
;; Starts up all the threads and schedules jobs until crawling is complete
(define (crawler-jobserver root-url)
;; Inner function to be run in its own thread
(define (jobserver-thread)
;; Create workers
(define worker-threads
(let ([parent (current-thread)])
(for/vector ([i (in-range NUM-TASKS)])
(thread (lambda () (crawler-worker-thread i parent))))))
;; Create worker status - #t means the worker is ready for the next task
;; #f means the worker is currently busy
(define worker-ready (make-vector NUM-TASKS #t))
;; URLs we have yet to crawl
(define pending (mutable-set root-url))
;; URLs we have crawled already
(define completed (mutable-set))
;; All flags seen so far
(define all-flags (mutable-set))
;; Main loop
(let loop ()
(define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
;; Loop until there are no more busy workers and there are also no more pending URLs
;; Then, we are done
(unless (and (not any-busy) (set-empty? pending))
;; Find any available worker to give the next job to
(define available-worker
(for/first ([w (in-naturals)]
[r (in-vector worker-ready)]
#:when r)
w))
;; If there is a worker and a job, assign the job
(when (and (not (set-empty? pending)) available-worker)
(define next-job (set-first pending))
(set-remove! pending next-job)
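For reference, the for/first + in-naturals idiom used above to pick a worker, run on a toy readiness vector:

(for/first ([w (in-naturals)]
            [r (in-vector (vector #f #f #t #f))]
            #:when r)
  w)
;; => 2, the index of the first ready slot; evaluates to #f when every worker is busy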
@@ -141,9 +166,12 @@
(vector-set! worker-ready available-worker #f)
(set! any-busy #t))
;; If there are no more jobs right now or there are no available workers, wait for one of
;; the workers to complete its job
(when (or (set-empty? pending) (false? available-worker))
(match-define (list worker-id job-url new-urls flags) (thread-receive))
(vector-set! worker-ready worker-id #t)
;; Update our state with the results
(set-add! completed job-url)
(for ([u (in-list new-urls)])
(unless (set-member? completed u)
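The body of that unless is cut off by the hunk boundary; a hedged sketch of the dedup step it is presumably doing (mutable sets already deduplicate on add, so the completed check is the part that matters):

(define (note-discovered!/sketch pending completed new-urls)
  ;; only URLs we have not already finished go into the work queue;
  ;; re-adding something already pending is a harmless no-op
  (for ([u (in-list new-urls)])
    (unless (set-member? completed u)
      (set-add! pending u))))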
@@ -152,6 +180,7 @@
(print-flag flag)
(set-add! all-flags flag)))
;; Print a useful status line when running in debug mode
(print-progress (+ (set-count completed) (set-count pending))
(for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
(set-count completed) (set-count all-flags))
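print-progress itself is not shown in this diff; a hedged sketch of a printer with the same four-argument shape (total discovered, busy workers, completed, flags), just to document what the call site passes (the real one presumably also checks a debug flag):

(define (print-progress/sketch total busy completed flags)
  (eprintf "[~a/~a done] ~a workers busy, ~a flags found\r" completed total busy flags))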
@@ -176,6 +205,7 @@
(define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
(crawler-jobserver base-url))
;; Parse command line arguments and run
(module+ main
(command-line
#:program "webcrawler"
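The command-line form is cut off by the hunk; purely as an illustration, a complete form of that shape might look like the following (the entry-point name and argument spec are hypothetical, not from this commit):

(module+ main
  (command-line
   #:program "webcrawler"
   #:args args                ; positional arguments, whatever the crawler expects
   (run-webcrawler args)))    ; run-webcrawler stands in for the real entry point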