Add comments (and now it's sleepy time :iitalics: :blobcatsleepreach:)
parent 1c9c029768
commit 4475e50cc5

@@ -46,6 +46,8 @@

;; -> CookieJar
;; Creates empty cookie jar
;; Since Racket threads are preemptive (gross) we include a semaphore to guard against concurrent
;; modification
(define (make-cookie-jar)
  (cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
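
Note: the cj constructor itself isn't part of this hunk. Below is a minimal sketch of the shape it presumably has, with assumed field names and a hypothetical semaphore-guarded setter, just to illustrate why the jar carries a semaphore; it is not code from this commit.

;; Sketch only: assumed struct fields and a hypothetical guarded update helper
(struct cj (table header sema))

(define (cookie-jar-set! jar name value)
  ;; take the semaphore around every mutation so preemptive threads can't interleave
  (call-with-semaphore (cj-sema jar)
    (lambda () (hash-set! (cj-table jar) name value))))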

@@ -94,6 +96,7 @@
  (match xexpr
    [(list _ (? (curry member? '(class "secret_flag"))) str)
     (match (string-split (string-trim str) " ")
       ;; Match the exact flag format
       [(list "FLAG:" (? (compose (curry = 64) string-length) flag))
        (list flag)]
       [_ '()])]
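
A standalone illustration of the inner match, assuming #lang racket; the sample string here is made up:

(define sample (string-append "FLAG: " (make-string 64 #\a)))
(match (string-split (string-trim sample) " ")
  [(list "FLAG:" (? (compose (curry = 64) string-length) flag)) (list flag)]
  [_ '()])
;; => a one-element list holding the 64-character flag; any other shape gives '()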

webcrawler: 30 additions

@@ -25,15 +25,20 @@

;; Returns response for request without closing the socket
(define (crawler-fetch/noclose method path [body #f])
  ;; Create the request based on the given parameters
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  ;; Do the thing, with the thing
  ;; Include cookie jar cookies
  (define sock
    (http-connect
     HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                          DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  ;; If we got any new cookies (yum!!) add them to the jar
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  ;; Abstract over some response codes we can handle directly here
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
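
A hypothetical call site, assuming current-cookie-jar is a parameter; "GET" and "/" are placeholder arguments and not necessarily what make-http-req expects:

(parameterize ([current-cookie-jar (make-cookie-jar)])
  (let-values ([(rsp sock) (crawler-fetch/noclose "GET" "/")])
    ;; ... use rsp, then the caller closes the socket ...
    (http-close sock)))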

@@ -57,10 +62,14 @@
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    ;; return an empty response for 403 and 404. the page won't be visited again
    ;; because it will have been added to the complete set
    [(or 403 404)
     (http-close sock)
     '()]
    ;; we shouldn't run into this one
    [400 (error "you screwed up, got error 400 :angery:" params)]
    ;; normal response yay!!
    [200
     (define xe
       (string->xexpr

@@ -101,6 +110,7 @@
         [flags (find-flags xe)]
         [page-links (map (lambda (x) (combine-url/relative page-url x))
                          (find-hrefs xe))])
    ;; only return URLs that are OK to crawl
    (values (filter crawler-valid-url? page-links) flags)))

;; ---------------------------------------------------------------------------------------
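
crawler-valid-url? isn't shown in this diff; presumably it keeps the crawl on the target host, roughly like the assumed sketch below (not the real predicate):

;; Assumed sketch of the filter predicate, using the net/url accessors
(define (crawler-valid-url? u)
  (and (member (url-scheme u) '("http" "https"))
       (equal? (url-host u) HOST)))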

@@ -108,32 +118,47 @@

(define NUM-TASKS 256)

;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
(define (crawler-worker-thread worker-id jobserver)
  (let loop ()
    (define next-job (thread-receive))
    ;; jobserver sends a #f when it's time to exit
    (when next-job
      (define-values (new-urls flags) (crawler-iterate-one next-job))
      (thread-send jobserver (list worker-id next-job new-urls flags))
      (loop))))

;; Starts up all the threads and schedules jobs until crawling is complete
(define (crawler-jobserver root-url)
  ;; Inner function to be run in its own thread
  (define (jobserver-thread)
    ;; Create workers
    (define worker-threads
      (let ([parent (current-thread)])
        (for/vector ([i (in-range NUM-TASKS)])
          (thread (lambda () (crawler-worker-thread i parent))))))
    ;; Create worker status - #t means the worker is ready for the next task
    ;; #f means the worker is currently busy
    (define worker-ready (make-vector NUM-TASKS #t))
    ;; URLs we have yet to crawl
    (define pending (mutable-set root-url))
    ;; URLs we have crawled already
    (define completed (mutable-set))
    ;; All flags seen so far
    (define all-flags (mutable-set))
    ;; Main loop
    (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
      ;; Loop until there are no more busy workers and there are also no more pending URLs
      ;; Then, we are done
      (unless (and (not any-busy) (set-empty? pending))
        ;; Find any available worker to give the next job to
        (define available-worker
          (for/first ([w (in-naturals)]
                      [r (in-vector worker-ready)]
                      #:when r)
            w))
        ;; If there is a worker and a job, assign the job
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)

@@ -141,9 +166,12 @@
          (vector-set! worker-ready available-worker #f)
          (set! any-busy #t))

        ;; If there are no more jobs right now or there are no available workers, wait for one of
        ;; the workers to complete its job
        (when (or (set-empty? pending) (false? available-worker))
          (match-define (list worker-id job-url new-urls flags) (thread-receive))
          (vector-set! worker-ready worker-id #t)
          ;; Update our state with the results
          (set-add! completed job-url)
          (for ([u (in-list new-urls)])
            (unless (set-member? completed u)

@@ -152,6 +180,7 @@
              (print-flag flag)
              (set-add! all-flags flag)))

        ;; Useful status output, printed in debug mode
        (print-progress (+ (set-count completed) (set-count pending))
                        (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                        (set-count completed) (set-count all-flags))
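
The shutdown half of the worker protocol, the #f sentinel that crawler-worker-thread checks for, falls outside these hunks; presumably the jobserver does something like the following once the loop ends. A sketch rather than the committed code:

;; Sketch: tell every worker to exit, then wait for them all to finish
(for ([t (in-vector worker-threads)])
  (thread-send t #f))
(for ([t (in-vector worker-threads)])
  (thread-wait t))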

@@ -176,6 +205,7 @@
  (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (crawler-jobserver base-url))

;; Parse command line arguments and run
(module+ main
  (command-line
   #:program "webcrawler"