#!/usr/bin/env racket
#lang racket
; vim: ft=racket

;     __ __  __
;  __/ // /_/ /___ _____  ____ _   __  ___      ____  __
; /_  _  __/ / __ `/ __ \/ __ `/  / / / / | /| / / / / /
; /_  _  __/ / /_/ / / / / /_/ /  / /_/ /| |/ |/ / /_/ /
;  /_//_/ /_/\__,_/_/ /_/\__, /   \__,_/ |__/|__/\__,_/
;                       /____/

(require net/url-string
         smol-http
         "private/util.rkt")

(define HOST "fring.ccs.neu.edu")
(define ROOT-PATH "/fakebook/")
(define LOGIN-PATH "/accounts/login/")
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
                       ;; christo pls do not track thanks
                       (dnt . "1")))

;; Returns the response for a request without closing the socket.
;; Redirects (301/302) and server errors (500) are retried transparently;
;; any other status is left to the caller.
(define (crawler-fetch/noclose method path [body #f])
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  (define sock
    (http-connect HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                                       DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
     (define new-location (http-msg-header rsp 'location))
     (http-close sock)
     (crawler-fetch/noclose method new-location body)]
    [500
     ;; handle server failure retries transparently
     (http-close sock) ; close the old socket so retries don't leak connections
     (crawler-fetch/noclose method path body)]
    [_
     ;; other stuff like 403/404 is up to the caller
     (values rsp sock)]))

;; Returns the response for a request, closing the socket
(define (crawler-fetch . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (http-close sock)
  rsp)

;; Fetches a request and tries to parse the result as an xexpr
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    [(or 403 404) (http-close sock) '()]
    [200
     (define xe
       (string->xexpr
        (bytes->string/utf-8
         (for/fold ([res #""])
                   ([chunk (in-http-body-chunks (http-msg-body rsp))])
           (bytes-append res chunk)))))
     (http-close sock)
     xe]
    [code (http-close sock) (error "unexpected response code" code)]))

;; Logs in with the given username and password
(define (crawler-login username password)
  ;; hit the login page once so the server hands us a csrftoken cookie
  (crawler-fetch 'GET LOGIN-PATH)
  (define form-body
    (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
            username password
            (cookie-jar-ref (current-cookie-jar) "csrftoken")))
  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
  (void))

;; Checks whether this is a URL we should crawl: plain http, and on HOST only
(define (crawler-valid-url? page-url)
  (match page-url
    [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
    [_ #f]))

;; Fetches one page, prints any flags, and returns a list of URLs to continue
;; with (the list may contain duplicates)
(define (crawler-iterate-one page-url)
  (let* ([path (string-join (map path/param-path (url-path page-url)) "/")]
         [xe (crawler-fetch/xexpr 'GET path)]
         [flags (find-flags xe)]
         [page-links (map (lambda (x) (combine-url/relative page-url x))
                          (find-hrefs xe))])
    (for ([flag (in-list flags)])
      (print-flag flag))
    (filter crawler-valid-url? page-links)))

(define (run-webcrawler username password)
  (printf-debug "the credentials are: ~s ~s\n" username password)
  (printf-debug "logging in...\n")
  (crawler-login username password)
  (printf-debug "logged in\n")
  (define base-url
    (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (define next-pages (crawler-iterate-one base-url))
  (displayln (map url->string next-pages)))

(module+ main
  (command-line
   #:program "webcrawler"
   #:once-each
   [("-d") "Debug mode" (debug-mode? #t)]
   #:args (username password)
   (run-webcrawler username password)))
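
;; A possible full-crawl driver built on crawler-iterate-one: a breadth-first
;; loop that keeps a frontier of URLs to visit plus a set of already-seen URLs
;; (keyed by their string form) so each page is fetched at most once. This is
;; only a sketch and is not wired into main; run-webcrawler above only crawls
;; the root page and prints the frontier it finds there.
(define (crawler-run-all start-url)
  (let loop ([frontier (list start-url)] [seen (set)])
    (match frontier
      ['() (void)]
      [(cons next rest)
       (define key (url->string next))
       (if (set-member? seen key)
           (loop rest seen)            ; already crawled; skip it
           (loop (append rest (crawler-iterate-one next))
                 (set-add seen key)))])))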
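
;; Quick sanity checks for crawler-valid-url? (a minimal sketch, not
;; exhaustive; run via `raco test`). Paths beyond HOST are illustrative.
(module+ test
  (require rackunit)
  ;; http URLs on HOST are crawlable...
  (check-true (crawler-valid-url? (string->url "http://fring.ccs.neu.edu/fakebook/")))
  ;; ...but other schemes and other hosts are not
  (check-false (crawler-valid-url? (string->url "https://fring.ccs.neu.edu/fakebook/")))
  (check-false (crawler-valid-url? (string->url "http://example.com/"))))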