Add comments (and now it's sleepy time :iitalics: :blobcatsleepreach:)
commit 4475e50cc5
parent 1c9c029768
@@ -46,6 +46,8 @@
 
 ;; -> CookieJar
 ;; Creates empty cookie jar
+;; Since Racket threads are preemptive (gross) we include a semaphore to guard against concurrent
+;; modification
 (define (make-cookie-jar)
   (cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
 
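The new comment explains why make-cookie-jar packs a semaphore into the cj struct. As a minimal sketch of how that guard might be used elsewhere in the file (jar-update! and the cj field accessors are assumptions for illustration, not code from this commit):

#lang racket/base

;; Sketch only: the field names cj-cookies / cj-sem are hypothetical accessors.
(struct cj (cookies header sem))

(define (jar-update! jar key value)
  ;; call-with-semaphore takes the lock, runs the thunk, and releases the lock
  ;; even if the thunk escapes, so preemptive worker threads can't interleave.
  (call-with-semaphore (cj-sem jar)
    (lambda () (hash-set! (cj-cookies jar) key value))))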
@@ -94,6 +96,7 @@
   (match xexpr
     [(list _ (? (curry member? '(class "secret_flag"))) str)
      (match (string-split (string-trim str) " ")
+      ;; Match the exact flag format
       [(list "FLAG:" (? (compose (curry = 64) string-length) flag))
        (list flag)]
       [_ '()])]
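For reference, the pattern above reads: accept only a two-token string "FLAG: <value>" whose value is exactly 64 characters long. A self-contained sketch of the same check (extract-flag is a hypothetical wrapper, not from this commit):

#lang racket

;; Standalone version of the match above, for illustration.
(define (extract-flag str)
  (match (string-split (string-trim str) " ")
    ;; (? pred pat) applies pred first, then binds flag on success
    [(list "FLAG:" (? (compose (curry = 64) string-length) flag)) (list flag)]
    [_ '()]))

(extract-flag (string-append "FLAG: " (make-string 64 #\a))) ; => '("aaa...")
(extract-flag "nope")                                        ; => '()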
webcrawler (+30 lines)
@@ -25,15 +25,20 @@
 
 ;; Returns response for request without closing the socket
 (define (crawler-fetch/noclose method path [body #f])
+  ;; Create the request based on the given parameters
   (define req
     (let ([basic-req (make-http-req method path '())])
       (if body (http-set-body basic-req body) basic-req)))
+  ;; Do the thing, with the thing
+  ;; Include cookie jar cookies
   (define sock
     (http-connect
      HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                           DEFAULT-HDRS)))
   (define rsp (http-request sock req))
+  ;; If we got any new cookies (yum!!) add them to the jar
   (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
+  ;; Abstract over some response codes we can handle directly here
   (match (http-rsp-code rsp)
     [(or 301 302)
      ;; handle redirects transparently
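The 301/302 branch is truncated by the diff context. One plausible shape for transparent redirect handling, assuming http-msg-headers returns an alist with a 'location entry (an assumption; the real accessor shape isn't shown in this hunk):

;; Sketch, not the commit's code: close the socket, then re-fetch the
;; Location target with the same method and body.
(define (fetch-redirect rsp sock method body)
  (define loc (cdr (assq 'location (http-msg-headers rsp))))
  (http-close sock)
  (crawler-fetch/noclose method loc body))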
@@ -57,10 +62,14 @@
 (define (crawler-fetch/xexpr . params)
   (define-values [rsp sock] (apply crawler-fetch/noclose params))
   (match (http-rsp-code rsp)
+    ;; return an empty response for 403 and 404. the page won't be visited again
+    ;; because it will have been added to the complete set
     [(or 403 404)
      (http-close sock)
      '()]
+    ;; we shouldn't run into this one
     [400 (error "you screwed up, got error 400 :angery:" params)]
+    ;; normal response yay!!
     [200
      (define xe
       (string->xexpr
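The effect of these branches at a call site: dead pages come back as an empty result rather than raising, so callers can treat every URL uniformly (the path below is hypothetical):

;; '() for a 403/404 page, a parsed xexpr for a 200 page.
(define xe (crawler-fetch/xexpr 'GET "/some/page"))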
@@ -101,6 +110,7 @@
          [flags (find-flags xe)]
          [page-links (map (lambda (x) (combine-url/relative page-url x))
                           (find-hrefs xe))])
+    ;; only return URLs that are OK to crawl
     (values (filter crawler-valid-url? page-links) flags)))
 
 ;; ---------------------------------------------------------------------------------------
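crawler-valid-url? is defined elsewhere in the file. A plausible definition, sketched under the assumption that it simply pins the scheme and the crawl host (not the commit's actual code):

(require net/url)

;; Keep only http URLs on the crawl host so the crawler never wanders off-site.
;; HOST is the file's existing constant.
(define (crawler-valid-url? u)
  (and (equal? (url-scheme u) "http")
       (equal? (url-host u) HOST)))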
@@ -108,32 +118,47 @@
 
 (define NUM-TASKS 256)
 
+;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
 (define (crawler-worker-thread worker-id jobserver)
   (let loop ()
     (define next-job (thread-receive))
+    ;; jobserver sends a #f when it's time to exit
     (when next-job
       (define-values (new-urls flags) (crawler-iterate-one next-job))
       (thread-send jobserver (list worker-id next-job new-urls flags))
       (loop))))
 
+;; Starts up all the threads and schedules jobs until crawling is complete
 (define (crawler-jobserver root-url)
+  ;; Inner function to be run in its own thread
   (define (jobserver-thread)
+    ;; Create workers
     (define worker-threads
       (let ([parent (current-thread)])
         (for/vector ([i (in-range NUM-TASKS)])
           (thread (lambda () (crawler-worker-thread i parent))))))
+    ;; Create worker status - #t means the worker is ready for the next task
+    ;; #f means the worker is currently busy
     (define worker-ready (make-vector NUM-TASKS #t))
+    ;; URLs we have yet to crawl
     (define pending (mutable-set root-url))
+    ;; URLs we have crawled already
     (define completed (mutable-set))
+    ;; All flags seen so far
     (define all-flags (mutable-set))
+    ;; Main loop
     (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
+      ;; Loop until there are no more busy workers and there are also no more pending URLs
+      ;; Then, we are done
      (unless (and (not any-busy) (set-empty? pending))
+        ;; Find any available worker to give the next job to
        (define available-worker
          (for/first ([w (in-naturals)]
                      [r (in-vector worker-ready)]
                      #:when r)
            w))
+        ;; If there is a worker and a job, assign the job
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)
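The "#f means exit" convention in crawler-worker-thread implies a shutdown step that this hunk never reaches. A sketch of what that might look like (assumed, not shown in the diff):

;; Tell every worker to leave its loop, then wait for each to finish.
(for ([w (in-vector worker-threads)])
  (thread-send w #f))
(for ([w (in-vector worker-threads)])
  (thread-wait w))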
@@ -141,9 +166,12 @@
           (vector-set! worker-ready available-worker #f)
           (set! any-busy #t))
 
+        ;; If there are no more jobs right now or there are no available workers, wait for one of
+        ;; the workers to complete its job
         (when (or (set-empty? pending) (false? available-worker))
           (match-define (list worker-id job-url new-urls flags) (thread-receive))
           (vector-set! worker-ready worker-id #t)
+          ;; Update our state with the results
           (set-add! completed job-url)
           (for ([u (in-list new-urls)])
             (unless (set-member? completed u)
@@ -152,6 +180,7 @@
             (print-flag flag)
             (set-add! all-flags flag)))
 
+      ;; Useful debug status is printed in debug mode
       (print-progress (+ (set-count completed) (set-count pending))
                       (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                       (set-count completed) (set-count all-flags))
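print-progress itself isn't shown in this diff; from the call above its arguments appear to be the total, busy, completed, and flag counts. A hedged sketch (the DEBUG flag and output format are assumptions):

;; Hypothetical: overwrite a single status line per iteration when debugging.
(define DEBUG #t)
(define (print-progress total busy done flags)
  (when DEBUG
    (eprintf "\r~a/~a crawled, ~a busy, ~a flags" done total busy flags)))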
@@ -176,6 +205,7 @@
   (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
   (crawler-jobserver base-url))
 
+;; Parse command line arguments and run
 (module+ main
   (command-line
    #:program "webcrawler"
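The command-line form is cut off by the diff context. A typical completion of racket/cmdline's command-line, with hypothetical positional arguments (the real ones aren't visible here):

;; Sketch: two positional args handed to a hypothetical entry point.
(command-line
 #:program "webcrawler"
 #:args (username password)
 (run-webcrawler username password)) ; run-webcrawler is assumed, not shown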