diff --git a/private/util.rkt b/private/util.rkt
index 43b6009..d355d37 100644
--- a/private/util.rkt
+++ b/private/util.rkt
@@ -46,6 +46,8 @@
 ;; -> CookieJar
 ;; Creates empty cookie jar
+;; Since Racket threads are preemptive (gross) we include a semaphore to guard against concurrent
+;; modification
 (define (make-cookie-jar)
   (cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
@@ -94,6 +96,7 @@
   (match xexpr
     [(list _ (? (curry member? '(class "secret_flag"))) str)
      (match (string-split (string-trim str) " ")
+       ;; Match the exact flag format
        [(list "FLAG:" (? (compose (curry = 64) string-length) flag)) (list flag)]
        [_ '()])]
diff --git a/webcrawler b/webcrawler
index 27d9073..db19728 100755
--- a/webcrawler
+++ b/webcrawler
@@ -25,15 +25,20 @@
 ;; Returns response for request without closing the socket
 (define (crawler-fetch/noclose method path [body #f])
+  ;; Create the request based on the given parameters
   (define req
     (let ([basic-req (make-http-req method path '())])
       (if body (http-set-body basic-req body) basic-req)))
+  ;; Connect and send the request
+  ;; Include cookie jar cookies
   (define sock
     (http-connect HOST #:headers (cons (cookie-jar->header (current-cookie-jar)) DEFAULT-HDRS)))
   (define rsp (http-request sock req))
+  ;; If we got any new cookies (yum!!) add them to the jar
   (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
+  ;; Abstract over some response codes we can handle directly here
   (match (http-rsp-code rsp)
     [(or 301 302)
      ;; handle redirects transparently
@@ -57,10 +62,14 @@
 (define (crawler-fetch/xexpr . params)
   (define-values [rsp sock] (apply crawler-fetch/noclose params))
   (match (http-rsp-code rsp)
+    ;; return an empty response for 403 and 404. the page won't be visited again
+    ;; because it will have been added to the completed set
     [(or 403 404)
      (http-close sock)
      '()]
+    ;; we shouldn't run into this one
     [400 (error "you screwed up, got error 400 :angery:" params)]
+    ;; normal response yay!!
     [200
      (define xe
        (string->xexpr
@@ -101,6 +110,7 @@
          [flags (find-flags xe)]
          [page-links (map (lambda (x) (combine-url/relative page-url x))
                           (find-hrefs xe))])
+    ;; only return URLs that are OK to crawl
     (values (filter crawler-valid-url? page-links) flags)))
 ;; ---------------------------------------------------------------------------------------
@@ -108,32 +118,47 @@
 (define NUM-TASKS 256)
+;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
 (define (crawler-worker-thread worker-id jobserver)
   (let loop ()
     (define next-job (thread-receive))
+    ;; jobserver sends a #f when it's time to exit
     (when next-job
       (define-values (new-urls flags) (crawler-iterate-one next-job))
       (thread-send jobserver (list worker-id next-job new-urls flags))
       (loop))))
+;; Starts up all the threads and schedules jobs until crawling is complete
 (define (crawler-jobserver root-url)
+  ;; Inner function to be run in its own thread
   (define (jobserver-thread)
+    ;; Create workers
     (define worker-threads
       (let ([parent (current-thread)])
         (for/vector ([i (in-range NUM-TASKS)])
           (thread (lambda () (crawler-worker-thread i parent))))))
+    ;; Create worker status - #t means the worker is ready for the next task
+    ;; #f means the worker is currently busy
     (define worker-ready (make-vector NUM-TASKS #t))
+    ;; URLs we have yet to crawl
     (define pending (mutable-set root-url))
+    ;; URLs we have crawled already
     (define completed (mutable-set))
+    ;; All flags seen so far
     (define all-flags (mutable-set))
+    ;; Main loop
     (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
+      ;; Loop until there are no more busy workers and there are also no more pending URLs
+      ;; Then, we are done
      (unless (and (not any-busy) (set-empty? pending))
+        ;; Find any available worker to give the next job to
        (define available-worker
          (for/first ([w (in-naturals)] [r (in-vector worker-ready)] #:when r) w))
+        ;; If there is a worker and a job, assign the job
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)
@@ -141,9 +166,12 @@
          (vector-set! worker-ready available-worker #f)
          (set! any-busy #t))
+      ;; If there are no more jobs right now or there are no available workers, wait for one of
+      ;; the workers to complete its job
        (when (or (set-empty? pending) (false? available-worker))
          (match-define (list worker-id job-url new-urls flags) (thread-receive))
          (vector-set! worker-ready worker-id #t)
+        ;; Update our state with the results
          (set-add! completed job-url)
          (for ([u (in-list new-urls)])
            (unless (set-member? completed u)
@@ -152,6 +180,7 @@
              (print-flag flag)
              (set-add! all-flags flag)))
+      ;; Print progress status (only shown in debug mode)
       (print-progress (+ (set-count completed) (set-count pending))
                       (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                       (set-count completed)
                       (set-count all-flags))
@@ -176,6 +205,7 @@
   (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
   (crawler-jobserver base-url))
+;; Parse command line arguments and run
 (module+ main
   (command-line
    #:program "webcrawler"