#!/usr/bin/env racket
#lang racket
; vim: ft=racket
;     __ __  __                            __
;  __/ // /_/ /___ _____  ____ _   __  ___      ______  __
; /_  _  __/ / __ `/ __ \/ __ `/  / / / / | /| / / / / /
; /_  _  __/ / /_/ / / / / /_/ /  / /_/ /| |/ |/ / /_/ /
;  /_//_/ /_/\__,_/_/ /_/\__, /   \__,_/ |__/|__/\__,_/
;                       /____/
;
; A small concurrent web crawler for the "fakebook" assignment server.
; Logs in, walks every same-host link reachable from ROOT-PATH, and
; prints any "flags" found on the pages it visits.

(require net/url-string
         smol-http
         "private/util.rkt")

(define HOST "fring.ccs.neu.edu")
(define ROOT-PATH "/fakebook/")
(define LOGIN-PATH "/accounts/login/")
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
                       ;; christo pls do not track thanks
                       (dnt . "1")))

;; ---------------------------------------------------------------------------------------
;; HTTP fetch logic

;; Performs an HTTP request against HOST and returns (values rsp sock)
;; WITHOUT closing the socket, so the caller can stream the body.
;; Redirects (301/302) are followed transparently, and server errors (500)
;; are retried transparently. Cookies from every response are folded into
;; (current-cookie-jar).
;;
;; method : symbol, e.g. 'GET or 'POST
;; path   : request path string
;; body   : optional request body bytes (#f for none)
;;
;; NOTE(review): the 500 retry has no attempt limit — a persistently
;; failing endpoint will recurse forever. Acceptable for this assignment
;; server, but worth a cap in production.
(define (crawler-fetch/noclose method path [body #f])
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  (define sock
    (http-connect HOST
                  #:headers (cons (cookie-jar->header (current-cookie-jar))
                                  DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
     (define new-location (http-msg-header rsp 'location))
     (http-close sock)
     (crawler-fetch/noclose method new-location body)]
    [500
     ;; handle server failure retries transparently
     ;; FIX: close the dead socket before retrying — the original leaked
     ;; one open connection per 500 response.
     (http-close sock)
     (crawler-fetch/noclose method path body)]
    [_
     ;; other stuff like 403/404 up to caller
     (values rsp sock)]))

;; Same as crawler-fetch/noclose, but closes the socket and returns only
;; the response. Use when the caller does not need the body stream.
(define (crawler-fetch . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (http-close sock)
  rsp)

;; Fetches a request and parses the result body as an xexpr.
;; Returns '() on 403/404 (page simply skipped), raises on 400 or any
;; other unexpected status code. Always closes the socket before
;; returning or raising on a known code.
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    [(or 403 404)
     (http-close sock)
     '()]
    [400
     (error "you screwed up, got error 400 :angery:" params)]
    [200
     ;; accumulate the chunked body into one byte string, then parse
     (define xe
       (string->xexpr
        (bytes->string/utf-8
         (for/fold ([res #""])
                   ([chunk (in-http-body-chunks (http-msg-body rsp))])
           (bytes-append res chunk)))))
     (http-close sock)
     xe]
    [code
     (http-close sock)
     (error "unexpected response code" code)]))

;; ---------------------------------------------------------------------------------------
;; Utilities

;; Logs in with the given username and password.
;; First GETs the login page so the server sets a csrftoken cookie, then
;; POSTs the form with that token echoed back (Django-style CSRF).
(define (crawler-login username password)
  (crawler-fetch 'GET LOGIN-PATH)
  (define form-body
    (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
            username password
            (cookie-jar-ref (current-cookie-jar) "csrftoken")))
  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
  (void))

;; Checks if this is a URL we should crawl: plain http on exactly HOST.
(define (crawler-valid-url? page-url)
  (match page-url
    [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
    [_ #f]))

;; ---------------------------------------------------------------------------------------
;; Main crawler iterator

;; Fetches one page, prints any flags, and returns
;; (values urls-to-continue-with flags). The returned URL list is
;; filtered to crawlable ones but may contain duplicates.
(define (crawler-iterate-one page-url)
  (let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
         [xe (crawler-fetch/xexpr 'GET path)]
         [flags (find-flags xe)]
         [page-links (map (lambda (x) (combine-url/relative page-url x))
                          (find-hrefs xe))])
    (values (filter crawler-valid-url? page-links) flags)))

;; ---------------------------------------------------------------------------------------
;; Tasking logic

(define NUM-TASKS 256)

;; Worker loop: block on the mailbox for a URL, crawl it, send the
;; result back to the jobserver thread. A #f message is the shutdown
;; signal.
(define (crawler-worker-thread worker-id jobserver)
  (let loop ()
    (define next-job (thread-receive))
    (when next-job
      (define-values (new-urls flags) (crawler-iterate-one next-job))
      (thread-send jobserver (list worker-id next-job new-urls flags))
      (loop))))

;; Spawns NUM-TASKS workers and dispatches pending URLs to idle ones
;; until the pending set is empty and every worker is idle. Runs in its
;; own thread so it gets a private mailbox for worker replies.
(define (crawler-jobserver root-url)
  (define (jobserver-thread)
    (define worker-threads
      (let ([parent (current-thread)])
        (for/vector ([i (in-range NUM-TASKS)])
          (thread (lambda () (crawler-worker-thread i parent))))))
    ;; worker-ready[i] = #t when worker i is idle and may take a job
    (define worker-ready (make-vector NUM-TASKS #t))
    (define pending (mutable-set root-url))
    (define completed (mutable-set))
    (define num-flags (box 0))
    (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
      ;; done only when nothing is queued AND nobody is still working
      (unless (and (not any-busy) (set-empty? pending))
        (define available-worker
          (for/first ([w (in-naturals)]
                      [r (in-vector worker-ready)]
                      #:when r)
            w))
        ;; dispatch one pending job to an idle worker, if both exist
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)
          (thread-send (vector-ref worker-threads available-worker) next-job)
          (vector-set! worker-ready available-worker #f)
          (set! any-busy #t))
        ;; if we can't dispatch (queue drained or all workers busy),
        ;; block for one worker result and fold it in
        (when (or (set-empty? pending) (false? available-worker))
          (match-define (list worker-id job-url new-urls flags) (thread-receive))
          (vector-set! worker-ready worker-id #t)
          (set-add! completed job-url)
          (for ([u (in-list new-urls)])
            (unless (set-member? completed u)
              (set-add! pending u)))
          (for ([flag (in-list flags)])
            (print-flag flag)
            (set-box! num-flags (add1 (unbox num-flags)))))
        (print-progress (+ (set-count completed) (set-count pending))
                        (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                        (set-count completed)
                        (unbox num-flags))
        (loop)))
    (print-complete)
    ;; send all workers the shutdown message and wait
    (for ([thd (in-vector worker-threads)])
      (thread-send thd #f)
      (thread-wait thd)))
  ;; start a new thread so we get a unique mailbox
  (thread-wait (thread jobserver-thread)))

;; ---------------------------------------------------------------------------------------
;; public static void main open paren string square brackets args

;; Entry point: log in with the given credentials, then crawl from
;; ROOT-PATH until the jobserver drains.
(define (run-webcrawler username password)
  (printf-debug "the credentials are: ~s ~s\n" username password)
  (printf-debug "logging in...\n")
  (crawler-login username password)
  (printf-debug "logged in. Starting crawl\n")
  (define base-url
    (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (crawler-jobserver base-url))

(module+ main
  (command-line
   #:program "webcrawler"
   #:once-each
   [("-d") "Debug mode" (debug-mode? #t)]
   #:args (username password)
   (run-webcrawler username password)))