diff --git a/private/util.rkt b/private/util.rkt index b196d62..b112452 100644 --- a/private/util.rkt +++ b/private/util.rkt @@ -2,7 +2,7 @@ (require xml) -(provide debug-mode? printf-debug print-progress print-complete +(provide debug-mode? printf-debug print-progress print-complete print-flag current-cookie-jar update-cookie-jar! cookie-jar->header cookie-jar-ref string->xexpr find-flags find-hrefs) @@ -33,6 +33,12 @@ (when (debug-mode?) (printf "\r\x1b[KCrawl complete\n"))) +;; Str -> +;; Prints a flag +(define (print-flag flag) + (if (debug-mode?) + (printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress + (displayln flag))) ;; Cookie jar (struct cj [jar cache sema] #:transparent) diff --git a/smol-http/http-msg.rkt b/smol-http/http-msg.rkt index e197a7e..4a8ef5e 100644 --- a/smol-http/http-msg.rkt +++ b/smol-http/http-msg.rkt @@ -3,6 +3,7 @@ http-msg-headers http-msg-header http-msg-body + http-rsp-code write-http-msg read-http-msg http-add-headers @@ -62,6 +63,11 @@ [(assoc k (http-msg-headers msg)) => cdr] [else #f])) +;; (http-rsp-code msg) -> http-response-code? +;; msg : http-rsp? +(define (http-rsp-code msg) + (http-start-line:res-code (http-msg-start-line msg))) + ;; (http-add-headers msg hdrs) -> http-msg? ;; msg : http-msg? ;; hdrs : (listof (cons/c symbol? any?)) diff --git a/webcrawler b/webcrawler index a539134..09d0a74 100755 --- a/webcrawler +++ b/webcrawler @@ -9,7 +9,8 @@ ; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/ ; /____/ -(require smol-http +(require net/url-string + smol-http "private/util.rkt") (define HOST "fring.ccs.neu.edu") @@ -19,46 +20,89 @@ ;; christo pls do not track thanks (dnt . 
"1"))) -(define (crawler-fetch/noclose req) - (define sock (http-connect HOST - #:headers (cons (cookie-jar->header (current-cookie-jar)) - DEFAULT-HDRS))) +;; Returns response for request without closing the socket +(define (crawler-fetch/noclose method path [body #f]) + (define req + (let ([basic-req (make-http-req method path '())]) + (if body (http-set-body basic-req body) basic-req))) + (define sock + (http-connect + HOST #:headers (cons (cookie-jar->header (current-cookie-jar)) + DEFAULT-HDRS))) (define rsp (http-request sock req)) (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp)) - (values rsp sock)) + (match (http-rsp-code rsp) + [(or 301 302) + ;; handle redirects transparently + (define new-location (http-msg-header rsp 'location)) + (http-close sock) + (crawler-fetch/noclose method new-location body)] + [500 + ;; handle server failure retries transparently + (crawler-fetch/noclose method path body)] + [_ + ;; other stuff like 403/404 up to caller + (values rsp sock)])) -(define (crawler-fetch req) - (define-values [rsp sock] (crawler-fetch/noclose req)) +;; Returns response for request, closing socket +(define (crawler-fetch . params) + (define-values [rsp sock] (apply crawler-fetch/noclose params)) (http-close sock) rsp) -(define (crawler-fetch/xexpr req) - (define-values [rsp sock] (crawler-fetch/noclose req)) - (define xe (string->xexpr - (bytes->string/utf-8 - (for/fold ([res #""]) - ([chunk (in-http-body-chunks (http-msg-body rsp))]) - (bytes-append res chunk))))) - (http-close sock) - xe) +;; Fetches request and tries to parse the result as xexpr +(define (crawler-fetch/xexpr . 
params) + (define-values [rsp sock] (apply crawler-fetch/noclose params)) + (match (http-rsp-code rsp) + [(or 403 404) + (http-close sock) + '()] + [200 + (define xe + (string->xexpr + (bytes->string/utf-8 + (for/fold ([res #""]) + ([chunk (in-http-body-chunks (http-msg-body rsp))]) + (bytes-append res chunk))))) + (http-close sock) + xe] + [code (http-close sock) (error "unexpected response code" code)])) +;; Logs in with the given username and password (define (crawler-login username password) - (crawler-fetch (make-http-req 'GET LOGIN-PATH)) + (crawler-fetch 'GET LOGIN-PATH) (define form-body (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next=" username password (cookie-jar-ref (current-cookie-jar) "csrftoken"))) - (crawler-fetch (http-set-body (make-http-req 'POST LOGIN-PATH) - (string->bytes/utf-8 form-body))) + (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body)) (void)) +;; Checks if this is a URL we should crawl +(define (crawler-valid-url? page-url) + (match page-url + [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t] + [_ #f])) + +;; Fetches one page, prints any flags, and returns a list of urls to continue with +;; (may contain duplicates) +(define (crawler-iterate-one page-url) + (let* ([path (string-join (map path/param-path (url-path page-url)) "/")] + [xe (crawler-fetch/xexpr 'GET path)] + [flags (find-flags xe)] + [page-links (map (lambda (x) (combine-url/relative page-url x)) + (find-hrefs xe))]) + (for ([flag (in-list flags)]) + (print-flag flag)) + (filter crawler-valid-url? 
page-links))) + (define (run-webcrawler username password) (printf-debug "the credentials are: ~s ~s\n" username password) (printf-debug "logging in...\n") (crawler-login username password) (printf-debug "logged in\n") - (define xe (crawler-fetch/xexpr (make-http-req 'GET ROOT-PATH))) - (displayln (find-flags xe)) - (displayln (find-hrefs xe))) + (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f)) + (define next-pages (crawler-iterate-one base-url)) + (displayln (map url->string next-pages))) (module+ main (command-line