Add crawler handling for correct urls, response codes 301, 302, 403, 404, 500

xenia 2020-04-11 02:43:12 -04:00
parent e39b9addfe
commit dabf565d2b
3 changed files with 80 additions and 24 deletions
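The upshot of the change set below: the fetch helpers now dispatch on the HTTP status code, following 301/302 redirects and retrying 500s internally, while 403/404 are left to the caller. A minimal caller-side sketch, assuming the API introduced in the diffs below (the "/fakebook/" path is a placeholder, not from this commit):

    ;; Redirects (301/302) and server errors (500) are handled inside the
    ;; fetch helpers, so a caller only ever sees the final response.
    (define rsp (crawler-fetch 'GET "/fakebook/"))       ; closes the socket
    (define xe (crawler-fetch/xexpr 'GET "/fakebook/"))  ; '() on 403/404, parsed xexpr on 200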

View File

@@ -2,7 +2,7 @@
 (require xml)
-(provide debug-mode? printf-debug print-progress print-complete
+(provide debug-mode? printf-debug print-progress print-complete print-flag
          current-cookie-jar update-cookie-jar! cookie-jar->header cookie-jar-ref
          string->xexpr find-flags find-hrefs)
@@ -33,6 +33,12 @@
   (when (debug-mode?)
     (printf "\r\x1b[KCrawl complete\n")))
 
+;; Str ->
+;; Prints a flag
+(define (print-flag flag)
+  (if (debug-mode?)
+      (printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress
+      (displayln flag)))
+
 ;; Cookie jar
 (struct cj [jar cache sema] #:transparent)
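The `\r\x1b[K` prefix matters because, in debug mode, a progress counter is being redrawn in place on the current line; print-flag rewinds the cursor and erases that line before printing, so flags and the progress display don't interleave. A self-contained sketch of the same terminal trick (plain Racket, independent of this repo's code):

    #lang racket
    ;; Redraw a counter in place, then print a message without garbling it.
    (for ([i (in-range 5)])
      (printf "\rprogress: ~a" i) ; \r returns to column 0 without a newline
      (flush-output)
      (sleep 0.1))
    (printf "\r\x1b[Kdone\n")     ; ESC[K erases the leftover progress text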

View File

@@ -3,6 +3,7 @@
          http-msg-headers
          http-msg-header
          http-msg-body
+         http-rsp-code
          write-http-msg
          read-http-msg
          http-add-headers
@@ -62,6 +63,11 @@
     [(assoc k (http-msg-headers msg)) => cdr]
     [else #f]))
 
+;; (http-rsp-code msg) -> http-response-code?
+;; msg : http-rsp?
+(define (http-rsp-code msg)
+  (http-start-line:res-code (http-msg-start-line msg)))
+
 ;; (http-add-headers msg hdrs) -> http-msg?
 ;; msg : http-msg?
 ;; hdrs : (listof (cons/c symbol? any?))
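http-rsp-code simply projects the numeric status code out of the response's start line, which is what lets the crawler dispatch on it with match. A standalone sketch of that dispatch shape (the describe-code helper is hypothetical, not part of this commit):

    #lang racket
    ;; Mirrors how the crawler matches on (http-rsp-code rsp).
    (define (describe-code code)
      (match code
        [(or 301 302) 'redirect]   ; follow the Location header
        [500          'retry]      ; transient server failure, try again
        [(or 403 404) 'skip]       ; forbidden or missing, nothing to parse
        [200          'ok]
        [_            'unexpected]))

    (describe-code 302) ; => 'redirect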

View File

@@ -9,7 +9,8 @@
 ; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
 ;                      /____/
-(require smol-http
+(require net/url-string
+         smol-http
          "private/util.rkt")
 (define HOST "fring.ccs.neu.edu")
@@ -19,46 +20,89 @@
     ;; christo pls do not track thanks
     (dnt . "1")))
 
-(define (crawler-fetch/noclose req)
-  (define sock (http-connect HOST
-                             #:headers (cons (cookie-jar->header (current-cookie-jar))
+;; Returns response for request without closing the socket
+(define (crawler-fetch/noclose method path [body #f])
+  (define req
+    (let ([basic-req (make-http-req method path '())])
+      (if body (http-set-body basic-req body) basic-req)))
+  (define sock
+    (http-connect
+     HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                           DEFAULT-HDRS)))
   (define rsp (http-request sock req))
   (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
-  (values rsp sock))
+  (match (http-rsp-code rsp)
+    [(or 301 302)
+     ;; handle redirects transparently
+     (define new-location (http-msg-header rsp 'location))
+     (http-close sock)
+     (crawler-fetch/noclose method new-location body)]
+    [500
+     ;; handle server failure retries transparently
+     (crawler-fetch/noclose method path body)]
+    [_
+     ;; other stuff like 403/404 up to caller
+     (values rsp sock)]))
 
-(define (crawler-fetch req)
-  (define-values [rsp sock] (crawler-fetch/noclose req))
+;; Returns response for request, closing socket
+(define (crawler-fetch . params)
+  (define-values [rsp sock] (apply crawler-fetch/noclose params))
   (http-close sock)
   rsp)
 
-(define (crawler-fetch/xexpr req)
-  (define-values [rsp sock] (crawler-fetch/noclose req))
-  (define xe (string->xexpr
+;; Fetches request and tries to parse the result as xexpr
+(define (crawler-fetch/xexpr . params)
+  (define-values [rsp sock] (apply crawler-fetch/noclose params))
+  (match (http-rsp-code rsp)
+    [(or 403 404)
+     (http-close sock)
+     '()]
+    [200
+     (define xe
+       (string->xexpr
         (bytes->string/utf-8
         (for/fold ([res #""])
                   ([chunk (in-http-body-chunks (http-msg-body rsp))])
          (bytes-append res chunk)))))
     (http-close sock)
-  xe)
+     xe]
+    [code (http-close sock) (error "unexpected response code" code)]))
 
+;; Logs in with the given username and password
 (define (crawler-login username password)
-  (crawler-fetch (make-http-req 'GET LOGIN-PATH))
+  (crawler-fetch 'GET LOGIN-PATH)
   (define form-body
     (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
             username password (cookie-jar-ref (current-cookie-jar) "csrftoken")))
-  (crawler-fetch (http-set-body (make-http-req 'POST LOGIN-PATH)
-                                (string->bytes/utf-8 form-body)))
+  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
   (void))
 
+;; Checks if this is a URL we should crawl
+(define (crawler-valid-url? page-url)
+  (match page-url
+    [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
+    [_ #f]))
+
+;; Fetches one page, prints any flags, and returns a list of urls to continue with
+;; (may contain duplicates)
+(define (crawler-iterate-one page-url)
+  (let* ([path (string-join (map path/param-path (url-path page-url)) "/")]
+         [xe (crawler-fetch/xexpr 'GET path)]
+         [flags (find-flags xe)]
+         [page-links (map (lambda (x) (combine-url/relative page-url x))
+                          (find-hrefs xe))])
+    (for ([flag (in-list flags)])
+      (print-flag flag))
+    (filter crawler-valid-url? page-links)))
+
 (define (run-webcrawler username password)
   (printf-debug "the credentials are: ~s ~s\n" username password)
   (printf-debug "logging in...\n")
   (crawler-login username password)
   (printf-debug "logged in\n")
-  (define xe (crawler-fetch/xexpr (make-http-req 'GET ROOT-PATH)))
-  (displayln (find-flags xe))
-  (displayln (find-hrefs xe)))
+  (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
+  (define next-pages (crawler-iterate-one base-url))
+  (displayln (map url->string next-pages)))
 
 (module+ main
   (command-line
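As committed, run-webcrawler only walks one level: it fetches ROOT-PATH through crawler-iterate-one and prints the resulting frontier rather than recursing. A hedged sketch of the natural next step, a worklist loop over crawler-iterate-one with a visited set (the crawl-all name and structure are hypothetical, not part of this commit):

    ;; Hypothetical breadth-first crawl driver, assuming crawler-iterate-one
    ;; as defined above. Deduplicates by rendered URL string, which also
    ;; absorbs the duplicates crawler-iterate-one may return.
    (define (crawl-all start-url)
      (let loop ([frontier (list start-url)] [seen (set)])
        (match frontier
          ['() (void)]
          [(cons next rest)
           (define key (url->string next))
           (if (set-member? seen key)
               (loop rest seen)
               (loop (append rest (crawler-iterate-one next))
                     (set-add seen key)))])))

Appending new links to the tail of the frontier gives breadth-first order; consing them onto the front would make the crawl depth-first instead.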