Add crawler handling for valid URLs and response codes 301, 302, 403, 404, 500
parent e39b9addfe
commit dabf565d2b
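In outline, the webcrawler hunks below make the fetch layer dispatch on each response's status code: 301/302 redirects are re-fetched via the Location header, 500s are retried, and 403/404 pages are treated as dead ends. A standalone sketch of that policy, with names of my own choosing (not from the commit):

#lang racket
;; Sketch only: summarizes the status-code handling that crawler-fetch/noclose
;; and crawler-fetch/xexpr implement in the diff below; classify-response is a
;; made-up name, not a function from the commit.
(define (classify-response code)
  (match code
    [(or 301 302) 'follow-location]  ; re-fetch the Location header target
    [500          'retry]            ; re-issue the same request
    [(or 403 404) 'dead-end]         ; caller treats the page as having no links
    [200          'parse-body]       ; body is parsed into an xexpr
    [_            'unexpected]))     ; anything else raises an error in the diff

(classify-response 302) ; => 'follow-location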
@@ -2,7 +2,7 @@
 
 (require xml)
 
-(provide debug-mode? printf-debug print-progress print-complete
+(provide debug-mode? printf-debug print-progress print-complete print-flag
          current-cookie-jar update-cookie-jar! cookie-jar->header cookie-jar-ref
          string->xexpr find-flags find-hrefs)
 
@@ -33,6 +33,12 @@
   (when (debug-mode?)
     (printf "\r\x1b[KCrawl complete\n")))
 
+;; Str ->
+;; Prints a flag
+(define (print-flag flag)
+  (if (debug-mode?)
+      (printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress
+      (displayln flag)))
 
 ;; Cookie jar
 (struct cj [jar cache sema] #:transparent)
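A quick usage note on the new helper: when debug output is enabled, the "\r\x1b[K" prefix returns the cursor to the start of the line and erases it, so a flag printed mid-crawl overwrites the in-place progress line instead of garbling it. Illustrative call, not part of the diff:

;; "FLAG: example" stands in for a real flag string.
(print-flag "FLAG: example")
;; debug mode on  => prints "\r\x1b[KFLAG: example\n" (clears the progress line first)
;; debug mode off => prints "FLAG: example\n"          (plain displayln)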
@@ -3,6 +3,7 @@
          http-msg-headers
          http-msg-header
          http-msg-body
+         http-rsp-code
          write-http-msg
          read-http-msg
          http-add-headers
@@ -62,6 +63,11 @@
     [(assoc k (http-msg-headers msg)) => cdr]
     [else #f]))
 
+;; (http-rsp-code msg) -> http-response-code?
+;; msg : http-rsp?
+(define (http-rsp-code msg)
+  (http-start-line:res-code (http-msg-start-line msg)))
+
 ;; (http-add-headers msg hdrs) -> http-msg?
 ;; msg : http-msg?
 ;; hdrs : (listof (cons/c symbol? any?))
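The new accessor just pulls the numeric status code out of a parsed response's start line so callers can branch on it. A sketch of how the crawler helpers defined further down this page use it (after this commit, crawler-fetch/noclose already resolves 301/302/500 itself, so only the remaining codes reach the caller); the path is made up:

;; Sketch, not code from the commit.
(define-values [rsp sock] (crawler-fetch/noclose 'GET "/some/page"))
(case (http-rsp-code rsp)
  [(200)     (printf "got a page\n")]
  [(403 404) (printf "dead end, skip it\n")]
  [else      (printf "unexpected code ~a\n" (http-rsp-code rsp))])
(http-close sock)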
webcrawler (90 changed lines)
@@ -9,7 +9,8 @@
 ; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
 ;                        /____/
 
-(require smol-http
+(require net/url-string
+         smol-http
          "private/util.rkt")
 
 (define HOST "fring.ccs.neu.edu")
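The new net/url-string require brings in the url structure and helpers that the rest of the diff leans on (string->url, combine-url/relative, url->string). A small standalone illustration; the concrete /fakebook/ path is only an example, not a value from the commit:

#lang racket
(require net/url-string)

;; Parse an absolute URL, then resolve a relative href against it, the same
;; way crawler-iterate-one below combines hrefs with the page URL.
(define base (string->url "http://fring.ccs.neu.edu/fakebook/"))
(url-host base)                                  ; => "fring.ccs.neu.edu"
(url->string (combine-url/relative base "42/"))  ; => "http://fring.ccs.neu.edu/fakebook/42/"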
@@ -19,46 +20,89 @@
    ;; christo pls do not track thanks
    (dnt . "1")))
 
-(define (crawler-fetch/noclose req)
-  (define sock (http-connect HOST
-                             #:headers (cons (cookie-jar->header (current-cookie-jar))
-                                             DEFAULT-HDRS)))
+;; Returns response for request without closing the socket
+(define (crawler-fetch/noclose method path [body #f])
+  (define req
+    (let ([basic-req (make-http-req method path '())])
+      (if body (http-set-body basic-req body) basic-req)))
+  (define sock
+    (http-connect
+     HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
+                          DEFAULT-HDRS)))
   (define rsp (http-request sock req))
   (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
-  (values rsp sock))
+  (match (http-rsp-code rsp)
+    [(or 301 302)
+     ;; handle redirects transparently
+     (define new-location (http-msg-header rsp 'location))
+     (http-close sock)
+     (crawler-fetch/noclose method new-location body)]
+    [500
+     ;; handle server failure retries transparently
+     (crawler-fetch/noclose method path body)]
+    [_
+     ;; other stuff like 403/404 up to caller
+     (values rsp sock)]))
 
-(define (crawler-fetch req)
-  (define-values [rsp sock] (crawler-fetch/noclose req))
+;; Returns response for request, closing socket
+(define (crawler-fetch . params)
+  (define-values [rsp sock] (apply crawler-fetch/noclose params))
   (http-close sock)
   rsp)
 
-(define (crawler-fetch/xexpr req)
-  (define-values [rsp sock] (crawler-fetch/noclose req))
-  (define xe (string->xexpr
-              (bytes->string/utf-8
-               (for/fold ([res #""])
-                         ([chunk (in-http-body-chunks (http-msg-body rsp))])
-                 (bytes-append res chunk)))))
-  (http-close sock)
-  xe)
+;; Fetches request and tries to parse the result as xexpr
+(define (crawler-fetch/xexpr . params)
+  (define-values [rsp sock] (apply crawler-fetch/noclose params))
+  (match (http-rsp-code rsp)
+    [(or 403 404)
+     (http-close sock)
+     '()]
+    [200
+     (define xe
+       (string->xexpr
+        (bytes->string/utf-8
+         (for/fold ([res #""])
+                   ([chunk (in-http-body-chunks (http-msg-body rsp))])
+           (bytes-append res chunk)))))
+     (http-close sock)
+     xe]
+    [code (http-close sock) (error "unexpected response code" code)]))
 
+;; Logs in with the given username and password
 (define (crawler-login username password)
-  (crawler-fetch (make-http-req 'GET LOGIN-PATH))
+  (crawler-fetch 'GET LOGIN-PATH)
   (define form-body
     (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
             username password (cookie-jar-ref (current-cookie-jar) "csrftoken")))
-  (crawler-fetch (http-set-body (make-http-req 'POST LOGIN-PATH)
-                                (string->bytes/utf-8 form-body)))
+  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
   (void))
 
+;; Checks if this is a URL we should crawl
+(define (crawler-valid-url? page-url)
+  (match page-url
+    [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
+    [_ #f]))
+
+;; Fetches one page, prints any flags, and returns a list of urls to continue with
+;; (may contain duplicates)
+(define (crawler-iterate-one page-url)
+  (let* ([path (string-join (map path/param-path (url-path page-url)) "/")]
+         [xe (crawler-fetch/xexpr 'GET path)]
+         [flags (find-flags xe)]
+         [page-links (map (lambda (x) (combine-url/relative page-url x))
+                          (find-hrefs xe))])
+    (for ([flag (in-list flags)])
+      (print-flag flag))
+    (filter crawler-valid-url? page-links)))
+
 (define (run-webcrawler username password)
   (printf-debug "the credentials are: ~s ~s\n" username password)
   (printf-debug "logging in...\n")
   (crawler-login username password)
   (printf-debug "logged in\n")
-  (define xe (crawler-fetch/xexpr (make-http-req 'GET ROOT-PATH)))
-  (displayln (find-flags xe))
-  (displayln (find-hrefs xe)))
+  (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
+  (define next-pages (crawler-iterate-one base-url))
+  (displayln (map url->string next-pages)))
 
 (module+ main
   (command-line
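As of this commit, run-webcrawler performs a single crawler-iterate-one step from ROOT-PATH and prints the next URLs. A hedged sketch of how a full traversal might drive the new helper, with a worklist and a visited set; this loop is an assumption about a later step, not code from the diff:

;; Hypothetical driver, not part of this commit. Assumes the webcrawler module
;; context (#lang racket, so match and racket/set are available) and the
;; crawler-iterate-one / url->string bindings shown above.
(define (crawl-from start-url)
  (let loop ([todo (list start-url)]
             [seen (set)])
    (match todo
      ['() (void)]
      [(cons page-url rest)
       (define key (url->string page-url))
       (if (set-member? seen key)
           (loop rest seen)
           (loop (append rest (crawler-iterate-one page-url))
                 (set-add seen key)))])))

Flag printing already happens inside crawler-iterate-one, so a driver like this only has to manage the frontier and de-duplicate pages by their URL string.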