#!/usr/bin/env racket
#lang racket
; vim: ft=racket

;      __ __
;   __/ // /_/ /___  _____  ____ _   __  ___      ____  __
;  /_  _  __/ / __ `/ __ \/ __ `/  / / / / | /| / / / / /
; /_  _  __/ / /_/ / / / / /_/ /  / /_/ /| |/ |/ / /_/ /
; /_//_/  /_/\__,_/_/ /_/\__, /   \__,_/ |__/|__/\__,_/
;                        /____/

(require net/url-string smol-http "private/util.rkt")

(define HOST "fring.ccs.neu.edu")
(define ROOT-PATH "/fakebook/")
(define LOGIN-PATH "/accounts/login/")
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
                       ;; christo pls do not track thanks
                       (dnt . "1")))

;; ---------------------------------------------------------------------------------------
;; HTTP fetch logic

;; Returns the response for a request without closing the socket
(define (crawler-fetch/noclose method path [body #f])
  ;; Create the request based on the given parameters
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  ;; Do the thing, with the thing
  ;; Include cookie jar cookies
  (define sock
    (http-connect HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                                       DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  ;; If we got any new cookies (yum!!) add them to the jar
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  ;; Abstract over some response codes we can handle directly here
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
     (define new-location (http-msg-header rsp 'location))
     (http-close sock)
     (crawler-fetch/noclose method new-location body)]
    [500
     ;; handle server failure retries transparently
     ;; (close the old socket first so retrying doesn't leak connections)
     (http-close sock)
     (crawler-fetch/noclose method path body)]
    [_
     ;; other stuff like 403/404 up to caller
     (values rsp sock)]))

;; Returns the response for a request, closing the socket
(define (crawler-fetch . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (http-close sock)
  rsp)

;; Fetches a request and tries to parse the result as an xexpr
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    ;; return an empty response for 403 and 404. the page won't be visited again
    ;; because it will have been added to the complete set
    [(or 403 404)
     (http-close sock)
     '()]
    ;; we shouldn't run into this one
    [400 (error "you screwed up, got error 400 :angery:" params)]
    ;; normal response yay!!
    [200
     (define xe
       (string->xexpr
        (bytes->string/utf-8
         (for/fold ([res #""])
                   ([chunk (in-http-body-chunks (http-msg-body rsp))])
           (bytes-append res chunk)))))
     (http-close sock)
     xe]
    [code
     (http-close sock)
     (error "unexpected response code" code)]))

;; ---------------------------------------------------------------------------------------
;; Utilities

;; Logs in with the given username and password
(define (crawler-login username password)
  (crawler-fetch 'GET LOGIN-PATH)
  (define form-body
    (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
            username password
            (cookie-jar-ref (current-cookie-jar) "csrftoken")))
  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
  (void))
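;; A note on the login flow above (a sketch of the mechanics, not part of the
;; smol-http API): the initial GET makes the server set a csrftoken cookie,
;; which update-cookie-jar! captures; the POST then echoes that token back as
;; the csrfmiddlewaretoken form field, as Django-style servers require. With
;; placeholder credentials "someuser"/"somepass" the POSTed body would be:
;;
;;   username=someuser&password=somepass&csrfmiddlewaretoken=<csrftoken>&next=
;;
;; After (crawler-login "someuser" "somepass") returns, (current-cookie-jar)
;; should hold the server's session cookie, so later fetches are authenticated.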
;; Checks if this is a URL we should crawl
(define (crawler-valid-url? page-url)
  (match page-url
    [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
    [_ #f]))

;; ---------------------------------------------------------------------------------------
;; Main crawler iterator

;; Fetches one page; returns a list of urls to continue with (may contain
;; duplicates) and a list of any flags found on the page
(define (crawler-iterate-one page-url)
  (let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
         [xe (crawler-fetch/xexpr 'GET path)]
         [flags (find-flags xe)]
         [page-links (map (lambda (x) (combine-url/relative page-url x))
                          (find-hrefs xe))])
    ;; only return URLs that are OK to crawl
    (values (filter crawler-valid-url? page-links) flags)))

;; ---------------------------------------------------------------------------------------
;; Tasking logic

(define NUM-TASKS 256)

;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
(define (crawler-worker-thread worker-id jobserver)
  (let loop ()
    (define next-job (thread-receive))
    ;; jobserver sends a #f when it's time to exit
    (when next-job
      (define-values (new-urls flags) (crawler-iterate-one next-job))
      (thread-send jobserver (list worker-id next-job new-urls flags))
      (loop))))
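;; The mailbox protocol between the jobserver and its workers, as implemented
;; above:
;;   jobserver -> worker : a url? to crawl, or #f to shut down
;;   worker -> jobserver : (list worker-id job-url new-urls flags)
;; A minimal single-worker round trip, as a sketch (`some-url` is a
;; placeholder for a url? you already have):
;;
;;   (define me (current-thread))
;;   (define w (thread (lambda () (crawler-worker-thread 0 me))))
;;   (thread-send w some-url)
;;   (thread-receive)    ; => (list 0 some-url urls-to-crawl flags)
;;   (thread-send w #f)  ; shutdown message
;;   (thread-wait w)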
;; Starts up all the threads and schedules jobs until crawling is complete
(define (crawler-jobserver root-url)
  ;; Inner function to be run in its own thread
  (define (jobserver-thread)
    ;; Create workers
    (define worker-threads
      (let ([parent (current-thread)])
        (for/vector ([i (in-range NUM-TASKS)])
          (thread (lambda () (crawler-worker-thread i parent))))))
    ;; Create worker status - #t means the worker is ready for the next task,
    ;; #f means the worker is currently busy
    (define worker-ready (make-vector NUM-TASKS #t))
    ;; URLs we have yet to crawl
    (define pending (mutable-set root-url))
    ;; URLs we have crawled already
    (define completed (mutable-set))
    ;; All flags seen so far
    (define all-flags (mutable-set))
    ;; Main loop
    (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
      ;; Loop until there are no more busy workers and there are also no more
      ;; pending URLs. Then, we are done
      (unless (and (not any-busy) (set-empty? pending))
        ;; Find any available worker to give the next job to
        (define available-worker
          (for/first ([w (in-naturals)]
                      [r (in-vector worker-ready)]
                      #:when r)
            w))
        ;; If there is a worker and a job, assign the job
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)
          (thread-send (vector-ref worker-threads available-worker) next-job)
          (vector-set! worker-ready available-worker #f)
          (set! any-busy #t))
        ;; If there are no more jobs right now or there are no available workers,
        ;; wait for one of the workers to complete its job
        (when (or (set-empty? pending) (false? available-worker))
          (match-define (list worker-id job-url new-urls flags) (thread-receive))
          (vector-set! worker-ready worker-id #t)
          ;; Update our state with the results
          (set-add! completed job-url)
          (for ([u (in-list new-urls)])
            (unless (set-member? completed u)
              (set-add! pending u)))
          (for ([flag (in-list flags)]
                #:when (not (set-member? all-flags flag)))
            (print-flag flag)
            (set-add! all-flags flag)))
        ;; Useful debug status is printed in debug mode
        (print-progress (+ (set-count completed) (set-count pending))
                        (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                        (set-count completed)
                        (set-count all-flags))
        (loop)))
    (print-complete (set-count completed) (set-count all-flags))
    ;; send all workers the shutdown message and wait
    (for ([thd (in-vector worker-threads)])
      (thread-send thd #f)
      (thread-wait thd)))
  ;; start a new thread so we get a unique mailbox
  (thread-wait (thread jobserver-thread)))

;; ---------------------------------------------------------------------------------------
;; public static void main open paren string square brackets args

(define (run-webcrawler username password)
  (printf-debug "the credentials are: ~s ~s\n" username password)
  (printf-debug "logging in...\n")
  (crawler-login username password)
  (printf-debug "logged in. Starting crawl\n")
  (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (crawler-jobserver base-url))

;; Parse command line arguments and run
(module+ main
  (command-line
   #:program "webcrawler"
   #:once-each
   [("-d") "Debug mode" (debug-mode? #t)]
   #:args (username password)
   (run-webcrawler username password)))
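;; Example invocation, assuming this file is saved as `webcrawler` and marked
;; executable (the credentials are placeholders):
;;
;;   $ ./webcrawler -d someuser somepass
;;
;; Each newly discovered flag is printed once via print-flag; with -d, progress
;; is also reported as the crawl runs.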