;; CS3700-project4/webcrawler
;; (scrape chrome removed: "115 lines / 3.9 KiB / Plaintext / Raw Normal View History")
#!/usr/bin/env racket
; 2020-04-10 23:12:50 +00:00  (scrape artifact: commit timestamp)
#lang racket
; vim: ft=racket
; 2020-04-11 04:32:34 +00:00  (scrape artifact: commit timestamp)
; __ __ __
; __/ // /_/ /___ _____ ____ _ __ ___ ____ __
; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
; /____/
(require net/uri-codec
         net/url-string
         smol-http
         "private/util.rkt")
; 2020-04-11 03:17:20 +00:00  (scrape artifact: commit timestamp)
;; Target host and the paths we care about on it.
(define HOST "fring.ccs.neu.edu")
(define ROOT-PATH "/fakebook/")         ; starting point for the crawl
(define LOGIN-PATH "/accounts/login/")  ; Django-style login form endpoint
; 2020-04-10 23:23:14 +00:00  (scrape artifact: commit timestamp)
;; Headers attached to every request (see crawler-fetch/noclose, which
;; prepends the current cookie header to this list).
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
                       ;; christo pls do not track thanks
                       (dnt . "1")))
;; Returns the response for a request without closing the socket.
;; Transparently follows 301/302 redirects and retries on 500, but caps the
;; total number of attempts so a redirect loop or a permanently-failing
;; server cannot recurse forever (the original recursed unboundedly, and the
;; 500 branch also leaked its socket). Other statuses (403/404, ...) are
;; left to the caller, who receives both the response and the open socket.
;; NOTE(review): a 301/302 Location header value is fed straight back in as
;; the request path — assumes same-host, path-only redirects; confirm.
(define (crawler-fetch/noclose method path [body #f])
  (let loop ([path path] [attempts-left 20])
    (when (zero? attempts-left)
      (error 'crawler-fetch/noclose "too many redirects/retries: ~a" path))
    (define req
      (let ([basic-req (make-http-req method path '())])
        (if body (http-set-body basic-req body) basic-req)))
    (define sock
      (http-connect
       HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                            DEFAULT-HDRS)))
    (define rsp (http-request sock req))
    ;; remember any Set-Cookie headers (session id, csrftoken) for later use
    (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
    (match (http-rsp-code rsp)
      [(or 301 302)
       ;; handle redirects transparently, on a fresh connection
       (define new-location (http-msg-header rsp 'location))
       (http-close sock)
       (loop new-location (sub1 attempts-left))]
      [500
       ;; handle server failure retries transparently; close first so the
       ;; retry does not leak this socket
       (http-close sock)
       (loop path (sub1 attempts-left))]
      [_
       ;; other stuff like 403/404 up to caller
       (values rsp sock)])))
;; Returns the response for a request, closing the socket before returning.
;; (Scrape-artifact timestamp lines that had been embedded in this
;; definition were removed.)
(define (crawler-fetch . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (http-close sock)
  rsp)
;; Fetches a request and parses the result as an xexpr.
;; Returns '() for 403/404, the parsed document for 200, and raises for any
;; other status code. The whole body is read and the socket closed *before*
;; parsing, so a malformed body can no longer leak the socket (the original
;; parsed first and skipped http-close if string->xexpr raised).
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    [(or 403 404)
     (http-close sock)
     '()]
    [200
     ;; accumulate the chunked body into one byte string
     (define body-bytes
       (for/fold ([res #""])
                 ([chunk (in-http-body-chunks (http-msg-body rsp))])
         (bytes-append res chunk)))
     (http-close sock)
     (string->xexpr (bytes->string/utf-8 body-bytes))]
    [code (http-close sock) (error "unexpected response code" code)]))
;; Logs in with the given username and password.
;; GETs the login page first so the server sets a csrftoken cookie, then
;; POSTs the credentials along with that token. Each form field is
;; form-urlencoded so a username/password containing '&', '=', or '%' can
;; no longer corrupt the request body (the original interpolated them raw).
(define (crawler-login username password)
  (crawler-fetch 'GET LOGIN-PATH)
  (define form-body
    (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
            (form-urlencoded-encode username)
            (form-urlencoded-encode password)
            (form-urlencoded-encode
             (cookie-jar-ref (current-cookie-jar) "csrftoken"))))
  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
  (void))
;; Predicate: is this a URL we should crawl? Only plain-http URLs on our
;; target host qualify; everything else is skipped.
(define (crawler-valid-url? page-url)
  (match page-url
    [(struct* url ([scheme "http"] [host (== HOST)])) #t]
    [_ #f]))
;; Fetches one page, prints any flags found on it, and returns the list of
;; crawlable urls it links to (may contain duplicates).
(define (crawler-iterate-one page-url)
  ;; rebuild the request path from the url's path segments
  (define path
    (string-join (map path/param-path (url-path page-url)) "/"))
  (define xe (crawler-fetch/xexpr 'GET path))
  (for ([flag (in-list (find-flags xe))])
    (print-flag flag))
  ;; resolve every href against the current page, keep only same-host links
  (filter crawler-valid-url?
          (for/list ([href (in-list (find-hrefs xe))])
            (combine-url/relative page-url href))))
;; Entry point: log in with the given credentials, then crawl starting from
;; the fakebook root page. (A scrape-artifact timestamp line embedded before
;; this definition was removed.)
;; NOTE(review): only a single crawl iteration is performed — the links it
;; returns are printed, not followed; presumably a work in progress.
(define (run-webcrawler username password)
  (printf-debug "the credentials are: ~s ~s\n" username password)
  (printf-debug "logging in...\n")
  (crawler-login username password)
  (printf-debug "logged in\n")
  (define base-url
    (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (define next-pages (crawler-iterate-one base-url))
  (displayln (map url->string next-pages)))
;; Command-line entry point: `webcrawler [-d] username password`.
;; -d enables debug logging via the debug-mode? parameter.
;; (Scrape-artifact timestamp lines embedded in this form were removed.)
(module+ main
  (command-line
   #:program "webcrawler"
   #:once-each
   [("-d") "Debug mode" (debug-mode? #t)]
   #:args (username password)
   (run-webcrawler username password)))