Compare commits
1 Commits
master
...
force-http
Author | SHA1 | Date |
---|---|---|
|
77dfb51d14 |
26
Makefile
26
Makefile
|
@ -1,12 +1,4 @@
|
|||
.PHONY: all test run clean
|
||||
|
||||
SOURCES=Makefile README.md private/util.rkt secret_flags smol-http/http-msg.rkt smol-http/info.rkt smol-http/main.rkt smol-http/util.rkt webcrawler
|
||||
CCIS_TEAM_NAME=hash-lang-uwu
|
||||
CCIS_PROJECT_NAME=project4
|
||||
CCIS_SERVER=login.ccs.neu.edu
|
||||
CCIS_PATH=CS3700/$(CCIS_PROJECT_NAME)/
|
||||
CCIS_FMT_CHECK=/course/cs3700sp20/code/project4/project4_format_check.py
|
||||
CCIS_TURNIN=/course/cs3700sp20/bin/turnin
|
||||
.PHONY: all test run
|
||||
|
||||
all: .setup
|
||||
raco setup smol-http
|
||||
|
@ -22,20 +14,8 @@ endif
|
|||
ifndef FB_PASSWORD
|
||||
$(error "Provide FB_PASSWORD variable")
|
||||
endif
|
||||
./webcrawler -d $(FB_USERNAME) $(FB_PASSWORD)
|
||||
./webcrawler $(FB_USERNAME) $(FB_PASSWORD)
|
||||
|
||||
.setup:
|
||||
-raco pkg install smol-http/
|
||||
raco pkg install smol-http/
|
||||
@touch $@
|
||||
|
||||
upload: $(SOURCES)
|
||||
@rsync -avzzR --progress $^ $(CCIS_SERVER):$(CCIS_PATH)
|
||||
@ssh $(CCIS_SERVER) -t -- "tput bold; $(CCIS_FMT_CHECK) $(CCIS_PATH); tput sgr0"
|
||||
|
||||
submit: upload
|
||||
ssh $(CCIS_SERVER) -t -- "cd $(CCIS_PATH); make clean; $(CCIS_TURNIN) $(CCIS_PROJECT_NAME) ."
|
||||
|
||||
clean:
|
||||
$(RM) .setup
|
||||
find . \( -iname '*.zo' -o -iname '*.dep' \) -delete
|
||||
find . -type d -iname compiled -delete
|
||||
|
|
31
README.md
31
README.md
|
@ -1,31 +0,0 @@
|
|||
|
||||
__ __ __
|
||||
__/ // /_/ /___ _____ ____ _ __ ___ ____ __
|
||||
/_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
|
||||
/_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
|
||||
/_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
|
||||
/____/
|
||||
|
||||
## High level approach
|
||||
|
||||
We started by creating robust abstracted HTTP-handling code, which is located in the `smol-http`
|
||||
module of this project. The HTTP code implements a subset of HTTP 1.1 which is enough to meet the
|
||||
requirements for crawling the target web server. It also uses plain TCP sockets to communicate using
|
||||
its HTTP implementation. We used Racket standard library functions to parse and manipulate URLs as
|
||||
well as parse HTML (as XML, hopefully it's well-formed!) in order to find the hyperlinks on the page
|
||||
as well as the flags. We implemented a high performance Certified Web Scale(tm) crawling scheduler
|
||||
with a distributed work queue to allow for very high rate crawling, the crawler on our machines
|
||||
takes minutes to complete, and finds all the flags very quickly.
|
||||
|
||||
## Challenges
|
||||
|
||||
The current pandemic situation continues to make this semester difficult. Otherwise, we didn't run
|
||||
into any major issues during this project.
|
||||
|
||||
## Testing
|
||||
|
||||
We unit tested the HTTP handling code in smol-http, and used ad-hoc manual testing against the
|
||||
target server to test the complete crawling functionality.
|
||||
|
||||
We have an additional `-d` flag which will print useful debug info during the execution of the
|
||||
crawler, which may be helpful for manual testing.
|
121
private/util.rkt
121
private/util.rkt
|
@ -1,121 +0,0 @@
|
|||
#lang racket
|
||||
|
||||
(require xml)
|
||||
|
||||
(provide debug-mode? printf-debug print-progress print-complete print-flag
|
||||
current-cookie-jar update-cookie-jar! cookie-jar->header cookie-jar-ref
|
||||
string->xexpr find-flags find-hrefs)
|
||||
|
||||
;; scheme is bad and it should feel bad
|
||||
(define member? member)
|
||||
|
||||
;; Debug prints
|
||||
|
||||
;; Parameter for debug mode
|
||||
(define debug-mode? (make-parameter #f))
|
||||
|
||||
;; ... ->
|
||||
;; Prints the arguments as with printf, but only when debug mode is on
|
||||
(define (printf-debug . args)
|
||||
(when (debug-mode?) (apply printf args)))
|
||||
|
||||
;; Int Int Int Int ->
|
||||
;; Prints progress to the console, only when debug mode is on
|
||||
(define (print-progress total-pages in-flight crawled-pages num-flags)
|
||||
(when (debug-mode?)
|
||||
(printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a"
|
||||
crawled-pages total-pages in-flight num-flags)
|
||||
(flush-output (current-output-port))))
|
||||
|
||||
;; ->
|
||||
;; Prints a completion message to the console, only when debug mode is on
|
||||
(define (print-complete total-pages num-flags)
|
||||
(when (debug-mode?)
|
||||
(printf "\r\x1b[KCrawl complete: ~a pages crawled, ~a flags found\n"
|
||||
total-pages num-flags)))
|
||||
|
||||
;; Str ->
|
||||
;; Prints a flag
|
||||
(define (print-flag flag)
|
||||
(if (debug-mode?)
|
||||
(printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress
|
||||
(displayln flag)))
|
||||
|
||||
;; Cookie jar
|
||||
(struct cj [jar cache sema] #:transparent)
|
||||
|
||||
;; -> CookieJar
|
||||
;; Creates empty cookie jar
|
||||
;; Since Racket threads are preemptive (gross) we include a semaphore to guard against concurrent
|
||||
;; modification
|
||||
(define (make-cookie-jar)
|
||||
(cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
|
||||
|
||||
;; Parameter for current cookie jar
|
||||
(define current-cookie-jar (make-parameter (make-cookie-jar)))
|
||||
|
||||
;; CookieJar -> String
|
||||
;; Gets a value from a cookie jar
|
||||
(define (cookie-jar-ref jar key)
|
||||
(call-with-semaphore
|
||||
(cj-sema jar)
|
||||
(lambda () (hash-ref (cj-jar jar) key))))
|
||||
|
||||
;; CookieJar [List-of (cons Symbol String)] ->
|
||||
;; For a response with the given headers, finds any set-cookie headers and updates the jar
|
||||
(define (update-cookie-jar! jar hdrs)
|
||||
(call-with-semaphore
|
||||
(cj-sema jar)
|
||||
(lambda ()
|
||||
(for ([hdr (in-list hdrs)])
|
||||
(when (symbol=? 'set-cookie (car hdr))
|
||||
(define kv (string-trim (first (string-split (cdr hdr) ";"))))
|
||||
(match-define (list k v) (string-split kv "="))
|
||||
(hash-set! (cj-jar jar) k v)))
|
||||
(set-box!
|
||||
(cj-cache jar)
|
||||
(cons 'cookie
|
||||
(string-join
|
||||
(for/list ([(k v) (in-hash (cj-jar jar))])
|
||||
(format "~a=~a" k v)) "; "))))))
|
||||
|
||||
;; CookieJar -> (cons Symbol String)
|
||||
;; Converts this cookie jar into a cookie header for requests
|
||||
(define (cookie-jar->header jar)
|
||||
(call-with-semaphore (cj-sema jar) (lambda () (unbox (cj-cache jar)))))
|
||||
|
||||
;; HTML (XML) handling
|
||||
;; String -> xexpr
|
||||
;; Parses the given string as xml
|
||||
(define (string->xexpr str)
|
||||
(xml->xexpr (document-element (read-xml (open-input-string str)))))
|
||||
|
||||
;; xexpr -> [List-of String]
|
||||
;; Finds all flags in the given xexpr
|
||||
(define (find-flags xexpr)
|
||||
(match xexpr
|
||||
[(list _ (? (curry member? '(class "secret_flag"))) str)
|
||||
(match (string-split (string-trim str) " ")
|
||||
;; Match the exact flag format
|
||||
[(list "FLAG:" (? (compose (curry = 64) string-length) flag))
|
||||
(list flag)]
|
||||
[_ '()])]
|
||||
[(list tag params rst ...)
|
||||
(foldl (lambda (x r) (append r (find-flags x))) '() rst)]
|
||||
[_ '()]))
|
||||
|
||||
;; xexpr -> [List-of String]
|
||||
;; Finds all <a> tag hrefs in the given xexpr
|
||||
(define (find-hrefs xexpr)
|
||||
(define (sub-find rst)
|
||||
(foldl (lambda (x r) (append r (find-hrefs x))) '() rst))
|
||||
(match xexpr
|
||||
[(list 'a params rst ...)
|
||||
(append
|
||||
(for/list ([param (in-list params)]
|
||||
#:when (symbol=? 'href (first param)))
|
||||
(second param))
|
||||
(sub-find rst))]
|
||||
[(list tag params rst ...)
|
||||
(sub-find rst)]
|
||||
[_ '()]))
|
10
secret_flags
10
secret_flags
|
@ -1,10 +0,0 @@
|
|||
0c0e8d5ceeb3da197450e74eb77010bcc0a13d1643575e72c5d573724945d9e6
|
||||
fe661b328cdd23a3d8ddd84b7b4254c525860328dd40046dd8677f91dca6341d
|
||||
8eccc00d948fc733c345c730ac86b8194aaa0376b5438b983ad00e1dd0fab73f
|
||||
388de86a629eb89093b27e9a92a29fd4e45c3aced219be1059238fd471849700
|
||||
38c7161b87ae13bc2c0a1499cf0ec4d0f6eace9f12cfb9042b26a969fd20dc48
|
||||
9b4410bffa98dc2c3f5f466bdb9bce63cec1d6d498a55d91d873cab3544cc0f4
|
||||
0fc45f2341dc966de6e350db2d0cc3070db99a5c66b06b0f8272b35bcfbf525d
|
||||
7926b87e2a1ac49700f0bccdec585fe0874f009a8ef5c5d0fe9ef7135fcf66eb
|
||||
d1584bcc2e42017db2c7146ab721d9172f2ae03d4ac42d8b6d6561125c7a212f
|
||||
c5fbc705700769d6c6bfc182f9e8cc20b340f92d4be4ffda00eb311375973542
|
|
@ -1,13 +1,8 @@
|
|||
#lang racket/base
|
||||
(provide http-msg?
|
||||
http-msg-headers
|
||||
http-msg-header
|
||||
http-msg-body
|
||||
http-rsp-code
|
||||
write-http-msg
|
||||
read-http-msg
|
||||
http-add-headers
|
||||
http-set-body
|
||||
|
||||
http-req? make-http-req
|
||||
http-res? make-http-res
|
||||
|
@ -40,7 +35,7 @@
|
|||
(and (http-msg? h) (http-start-line:res? (http-msg-start-line h))))
|
||||
|
||||
(define (make-msg sln hdrs)
|
||||
(http-add-headers (http-msg sln '() empty-http-body) hdrs))
|
||||
(http-set-headers (http-msg sln (hasheq) empty-http-body) hdrs))
|
||||
|
||||
;; (make-http-req method path [hdrs]) -> http-req?
|
||||
;; method : http-request-method?
|
||||
|
@ -55,50 +50,31 @@
|
|||
(define (make-http-res code [hdrs '()])
|
||||
(make-msg (http-start-line:res code) hdrs))
|
||||
|
||||
;; (http-set-header msg k) -> (or/c bytes? #f)
|
||||
;; msg : http-msg?
|
||||
;; k : symbol?
|
||||
(define (http-msg-header msg k)
|
||||
(cond
|
||||
[(assoc k (http-msg-headers msg)) => cdr]
|
||||
[else #f]))
|
||||
|
||||
;; (http-rsp-code msg) -> http-response-code?
|
||||
;; msg : http-rsp?
|
||||
(define (http-rsp-code msg)
|
||||
(http-start-line:res-code (http-msg-start-line msg)))
|
||||
|
||||
;; (http-add-headers msg hdrs) -> http-msg?
|
||||
;; (http-set-headers msg hdrs) -> http-msg?
|
||||
;; msg : http-msg?
|
||||
;; hdrs : (listof (cons/c symbol? any?))
|
||||
(define (http-add-headers msg hdrs)
|
||||
(define (http-set-headers msg hdrs)
|
||||
(struct-copy http-msg msg
|
||||
[headers (append (http-msg-headers msg) hdrs)]))
|
||||
[headers (for/fold ([hdrs (http-msg-headers msg)])
|
||||
([kv (in-list hdrs)])
|
||||
(if (cdr kv)
|
||||
(hash-set hdrs (car kv) (->bytes (cdr kv)))
|
||||
(hash-remove hdrs (car kv))))]))
|
||||
|
||||
;; (http-set-body msg how) -> http-msg?
|
||||
;; msg : http-msg?
|
||||
;; how : (or/c http-body? (http-body? . -> . http-body?))
|
||||
(define (http-set-body msg how)
|
||||
(define remove-old-headers
|
||||
(for/list ([kv (in-list (http-body-additional-headers (http-msg-body msg)))])
|
||||
(cons (car kv) #f)))
|
||||
(define bdy*
|
||||
(match how
|
||||
[(? http-body? bdy*) bdy*]
|
||||
[(? procedure? f) (f (http-msg-body msg))]))
|
||||
(http-add-headers (struct-copy http-msg msg [body bdy*])
|
||||
(http-body-additional-headers bdy*)))
|
||||
|
||||
;; We need an unchecked (no added headers) version because otherwise when parsing responses we add
|
||||
;; duplicate headers
|
||||
;; I was too lazy to abstract this i'm sorry milo ~ 🦈
|
||||
|
||||
;; (http-set-body/unchecked msg how) -> http-msg?
|
||||
;; msg : http-msg?
|
||||
;; how : (or/c http-body? (http-body? . -> . http-body?))
|
||||
(define (http-set-body/unchecked msg how)
|
||||
(define bdy*
|
||||
(match how
|
||||
[(? http-body? bdy*) bdy*]
|
||||
[(? procedure? f) (f (http-msg-body msg))]))
|
||||
(struct-copy http-msg msg [body bdy*]))
|
||||
(http-set-headers (struct-copy http-msg msg [body bdy*])
|
||||
(append remove-old-headers
|
||||
(http-body-additional-headers bdy*))))
|
||||
|
||||
;; (write-http-msg msg [port]) -> void?
|
||||
;; msg : http-msg?
|
||||
|
@ -110,8 +86,8 @@
|
|||
(fprintf port "~a ~a HTTP/1.1\r\n" mthd path)]
|
||||
[(http-start-line:res code)
|
||||
(fprintf port "HTTP/1.1 ~a ~a\r\n" code (http-response-code-name code))])
|
||||
(for ([kv (in-list hdrs)])
|
||||
(fprintf port "~a: ~a\r\n" (car kv) (cdr kv)))
|
||||
(for ([k (in-list (sort (hash-keys hdrs) symbol<?))])
|
||||
(fprintf port "~a: ~a\r\n" k (hash-ref hdrs k)))
|
||||
(write-bytes #"\r\n" port)
|
||||
(write-http-body body port))
|
||||
|
||||
|
@ -124,8 +100,8 @@
|
|||
(check-pred http-req? ex-req-/)
|
||||
(check-pred http-res? ex-res-ok)
|
||||
|
||||
;(check-equal? (hash-ref (http-msg-headers ex-req-/) 'host) #"localhost")
|
||||
;(check-equal? (hash-ref (http-msg-headers ex-res-ok) 'content-length) #"11")
|
||||
(check-equal? (hash-ref (http-msg-headers ex-req-/) 'host) #"localhost")
|
||||
(check-equal? (hash-ref (http-msg-headers ex-res-ok) 'content-length) #"11")
|
||||
|
||||
(define-syntax-rule (check-http-msg->bytes msg bs ...)
|
||||
(check-equal? (let ([p (open-output-bytes)])
|
||||
|
@ -142,7 +118,11 @@
|
|||
#"HTTP/1.1 200 OK\r\n"
|
||||
#"content-length: 11\r\n"
|
||||
#"\r\n"
|
||||
#"Hello world"))
|
||||
#"Hello world")
|
||||
|
||||
(check-http-msg->bytes (http-set-body ex-res-ok empty-http-body)
|
||||
#"HTTP/1.1 200 OK\r\n"
|
||||
#"\r\n"))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Message bodies
|
||||
|
@ -160,7 +140,7 @@
|
|||
(define (write-http-body bdy [port (current-output-port)])
|
||||
(match bdy
|
||||
[#f (void)]
|
||||
[(? bytes? bs) (void (write-bytes bs port))]
|
||||
[(? bytes? bs) (write-bytes bs port)]
|
||||
[(? stream? bss) (error 'write-http-body "chunked is not implemented sorry. ;(")]))
|
||||
|
||||
;; (in-http-body-chunks bdy) -> (streamof bytes)
|
||||
|
@ -180,7 +160,7 @@
|
|||
(define (http-body-additional-headers bdy)
|
||||
(match bdy
|
||||
[#f '()]
|
||||
[(? bytes? bs) `([content-length . ,(number->string (bytes-length bs))])]
|
||||
[(? bytes? bs) `([content-length . ,(bytes-length bs)])]
|
||||
[(? stream?) '([transfer-encoding . #"chunked"])]))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
|
@ -202,7 +182,8 @@
|
|||
|
||||
(define (string->header-kv s)
|
||||
(match (regexp-match #px"^([^:]+):\\s*(.*)$" s)
|
||||
[(list _ k v) (cons (string->symbol (string-downcase k)) v)]
|
||||
[(list _ k v) (cons (string->symbol (string-downcase k))
|
||||
(->bytes v))]
|
||||
[_ #f]))
|
||||
|
||||
(define (read-crlf-line port)
|
||||
|
@ -224,27 +205,25 @@
|
|||
(or (string->header-kv hln)
|
||||
(raise-read-http-error "read-http-msg: invalid header line: ~s" hln))))
|
||||
(define msg (make-msg sln hdrs))
|
||||
(define bdy (read-http-body (http-msg-header msg 'transfer-encoding)
|
||||
(http-msg-header msg 'content-length)
|
||||
(define bdy (read-http-body (hash-ref (http-msg-headers msg) 'transfer-encoding #f)
|
||||
(hash-ref (http-msg-headers msg) 'content-length #f)
|
||||
port))
|
||||
(http-set-body/unchecked msg bdy))
|
||||
(http-set-body msg bdy))
|
||||
|
||||
;; (http-body-reader tenc clen [port]) -> http-body?
|
||||
;; tenc, clen : (or/c #f bytes?)
|
||||
;; port : input-port?
|
||||
(define (read-http-body tenc clen [port (current-input-port)])
|
||||
(define len (and clen (string->int clen)))
|
||||
(define len (and clen (string->int (bytes->string/utf-8 clen))))
|
||||
(cond
|
||||
[(equal? tenc "chunked") (read-chunked-stream port)]
|
||||
[(equal? tenc #"chunked") (read-chunked-stream port)]
|
||||
[(number? len) (read-bytes len port)]
|
||||
[(and (not tenc) (not len)) #f]
|
||||
[else (raise-read-http-error "read-http-body: not sure how to read HTTP body")]))
|
||||
|
||||
;; input-port? -> [streamof bytes?]
|
||||
(define (read-chunked-stream port)
|
||||
(define len (string->number (read-crlf-line port) 16))
|
||||
(unless (integer? len)
|
||||
(raise-read-http-error "read-chunked-stream: invalid chunked encoding detected"))
|
||||
(define len (string->number (read-crlf-line port)))
|
||||
(define data (read-bytes len port))
|
||||
(match-define #"\r\n" (read-bytes 2 port))
|
||||
(if (zero? len)
|
||||
|
@ -269,9 +248,9 @@
|
|||
#"Content-Length: 5\r\n"
|
||||
#"\r\n"
|
||||
#"hello"
|
||||
(http-set-body/unchecked (make-http-res 200 '([content-type . "text/html"]
|
||||
[content-length . "5"]))
|
||||
#"hello"))
|
||||
(http-set-body (make-http-res 200 '([content-type . "text/html"]
|
||||
[content-length . 5]))
|
||||
#"hello"))
|
||||
|
||||
(let ([msg (read-http-msg
|
||||
(open-input-bytes (bytes-append #"HTTP/1.1 200 OK\r\n"
|
||||
|
|
|
@ -1,52 +1,9 @@
|
|||
#lang racket/base
|
||||
(require "./util.rkt"
|
||||
"./http-msg.rkt")
|
||||
(provide (all-from-out "./http-msg.rkt")
|
||||
http-request-method?
|
||||
http-response-code?
|
||||
http-response-code-name
|
||||
http-response-code-name)
|
||||
|
||||
http-socket?
|
||||
http-connect
|
||||
http-request
|
||||
http-close)
|
||||
|
||||
(require racket/tcp
|
||||
racket/match
|
||||
|
||||
"./util.rkt"
|
||||
"./http-msg.rkt")
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
|
||||
(define DEFAULT-PORT 80)
|
||||
|
||||
(struct http-socket [extra-headers conn-pair])
|
||||
|
||||
;; (http-connect host [port]) -> http-socket?
|
||||
;; host : string?
|
||||
;; port : port-number?
|
||||
(define (http-connect host
|
||||
[port DEFAULT-PORT]
|
||||
#:headers [x-hdrs '()])
|
||||
(http-socket `([host . ,host] ,@x-hdrs)
|
||||
(let-values ([(in out) (tcp-connect host port)])
|
||||
(cons in out))))
|
||||
|
||||
;; (http-close sock) -> void?
|
||||
;; sock : http-socket?
|
||||
(define (http-close sock)
|
||||
(match-define (cons in out) (http-socket-conn-pair sock))
|
||||
(tcp-abandon-port in)
|
||||
(tcp-abandon-port out))
|
||||
|
||||
;; (http-request sock req) -> http-res?
|
||||
;; sock : http-socket?
|
||||
;; req : http-req?
|
||||
(define (http-request sock req)
|
||||
(define req* (http-add-headers req (http-socket-extra-headers sock)))
|
||||
(match-define (cons in out) (http-socket-conn-pair sock))
|
||||
(write-http-msg req* out)
|
||||
(flush-output out)
|
||||
(read-http-msg in))
|
||||
|
||||
(module+ test
|
||||
)
|
||||
(module+ test)
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
(define http-status-codes
|
||||
#hasheqv([200 . "OK"]
|
||||
[301 . "Moved Permanently"]
|
||||
[400 . "Invalid Request"]
|
||||
[403 . "Forbidden"]
|
||||
[404 . "Not Found"]
|
||||
[500 . "Internal Server Error"]))
|
||||
|
|
204
webcrawler
204
webcrawler
|
@ -1,216 +1,22 @@
|
|||
#!/usr/bin/env racket
|
||||
#lang racket
|
||||
; vim: ft=racket
|
||||
(require smol-http)
|
||||
|
||||
; __ __ __
|
||||
; __/ // /_/ /___ _____ ____ _ __ ___ ____ __
|
||||
; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
|
||||
; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
|
||||
; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
|
||||
; /____/
|
||||
|
||||
(require net/url-string
|
||||
smol-http
|
||||
"private/util.rkt")
|
||||
|
||||
(define HOST "fring.ccs.neu.edu")
|
||||
(define ROOT-PATH "/fakebook/")
|
||||
(define LOGIN-PATH "/accounts/login/")
|
||||
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
|
||||
;; christo pls do not track thanks
|
||||
(dnt . "1")))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; HTTP fetch logic
|
||||
|
||||
;; Returns response for request without closing the socket
|
||||
(define (crawler-fetch/noclose method path [body #f])
|
||||
;; Create the request based on the given parameters
|
||||
(define req
|
||||
(let ([basic-req (make-http-req method path '())])
|
||||
(if body (http-set-body basic-req body) basic-req)))
|
||||
;; Do the thing, with the thing
|
||||
;; Include cookie jar cookies
|
||||
(define sock
|
||||
(http-connect
|
||||
HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
|
||||
DEFAULT-HDRS)))
|
||||
(define rsp (http-request sock req))
|
||||
;; If we got any new cookies (yum!!) add them to the jar
|
||||
(update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
|
||||
;; Abstract over some response codes we can handle directly here
|
||||
(match (http-rsp-code rsp)
|
||||
[(or 301 302)
|
||||
;; handle redirects transparently
|
||||
(define new-location (http-msg-header rsp 'location))
|
||||
(http-close sock)
|
||||
(crawler-fetch/noclose method new-location body)]
|
||||
[500
|
||||
;; handle server failure retries transparently
|
||||
(crawler-fetch/noclose method path body)]
|
||||
[_
|
||||
;; other stuff like 403/404 up to caller
|
||||
(values rsp sock)]))
|
||||
|
||||
;; Returns response for request, closing socket
|
||||
(define (crawler-fetch . params)
|
||||
(define-values [rsp sock] (apply crawler-fetch/noclose params))
|
||||
(http-close sock)
|
||||
rsp)
|
||||
|
||||
;; Fetches request and tries to parse the result as xexpr
|
||||
(define (crawler-fetch/xexpr . params)
|
||||
(define-values [rsp sock] (apply crawler-fetch/noclose params))
|
||||
(match (http-rsp-code rsp)
|
||||
;; return an empty response for 403 and 404. the page won't be visited again
|
||||
;; because it will have been added to the complete set
|
||||
[(or 403 404)
|
||||
(http-close sock)
|
||||
'()]
|
||||
;; we shouldn't run into this one
|
||||
[400 (error "you screwed up, got error 400 :angery:" params)]
|
||||
;; normal response yay!!
|
||||
[200
|
||||
(define xe
|
||||
(string->xexpr
|
||||
(bytes->string/utf-8
|
||||
(for/fold ([res #""])
|
||||
([chunk (in-http-body-chunks (http-msg-body rsp))])
|
||||
(bytes-append res chunk)))))
|
||||
(http-close sock)
|
||||
xe]
|
||||
[code (http-close sock) (error "unexpected response code" code)]))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Utilities
|
||||
|
||||
;; Logs in with the given username and password
|
||||
(define (crawler-login username password)
|
||||
(crawler-fetch 'GET LOGIN-PATH)
|
||||
(define form-body
|
||||
(format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
|
||||
username password (cookie-jar-ref (current-cookie-jar) "csrftoken")))
|
||||
(crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
|
||||
(void))
|
||||
|
||||
;; Checks if this is a URL we should crawl
|
||||
(define (crawler-valid-url? page-url)
|
||||
(match page-url
|
||||
[(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
|
||||
[_ #f]))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Main crawler iterator
|
||||
|
||||
;; Fetches one page, prints any flags, and returns a list of urls to continue with
|
||||
;; (may contain duplicates)
|
||||
(define (crawler-iterate-one page-url)
|
||||
(let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
|
||||
[xe (crawler-fetch/xexpr 'GET path)]
|
||||
[flags (find-flags xe)]
|
||||
[page-links (map (lambda (x) (combine-url/relative page-url x))
|
||||
(find-hrefs xe))])
|
||||
;; only return URLs that are OK to crawl
|
||||
(values (filter crawler-valid-url? page-links) flags)))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Tasking logic
|
||||
|
||||
(define NUM-TASKS 256)
|
||||
|
||||
;; Worker thread: recieves tasks, does the HTTP, returns yummy links and flags
|
||||
(define (crawler-worker-thread worker-id jobserver)
|
||||
(let loop ()
|
||||
(define next-job (thread-receive))
|
||||
;; jobserver sends a #f when it's time to exit
|
||||
(when next-job
|
||||
(define-values (new-urls flags) (crawler-iterate-one next-job))
|
||||
(thread-send jobserver (list worker-id next-job new-urls flags))
|
||||
(loop))))
|
||||
|
||||
;; Starts up all the threads and schedules jobs until crawling is complete
|
||||
(define (crawler-jobserver root-url)
|
||||
;; Inner function to be run in its own thread
|
||||
(define (jobserver-thread)
|
||||
;; Create workers
|
||||
(define worker-threads
|
||||
(let ([parent (current-thread)])
|
||||
(for/vector ([i (in-range NUM-TASKS)])
|
||||
(thread (lambda () (crawler-worker-thread i parent))))))
|
||||
;; Create worker status - #t means the worker is ready for the next task
|
||||
;; #f means the worker is currently busy
|
||||
(define worker-ready (make-vector NUM-TASKS #t))
|
||||
;; URLs we have yet to crawl
|
||||
(define pending (mutable-set root-url))
|
||||
;; URLs we have crawled already
|
||||
(define completed (mutable-set))
|
||||
;; All flags seen so far
|
||||
(define all-flags (mutable-set))
|
||||
;; Main loop
|
||||
(let loop ()
|
||||
(define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
|
||||
;; Loop until there are no more busy workers and there are also no more pending URLs
|
||||
;; Then, we are done
|
||||
(unless (and (not any-busy) (set-empty? pending))
|
||||
;; Find any available worker to give the next job to
|
||||
(define available-worker
|
||||
(for/first ([w (in-naturals)]
|
||||
[r (in-vector worker-ready)]
|
||||
#:when r)
|
||||
w))
|
||||
;; If there is a worker and a job, assign the job
|
||||
(when (and (not (set-empty? pending)) available-worker)
|
||||
(define next-job (set-first pending))
|
||||
(set-remove! pending next-job)
|
||||
(thread-send (vector-ref worker-threads available-worker) next-job)
|
||||
(vector-set! worker-ready available-worker #f)
|
||||
(set! any-busy #t))
|
||||
|
||||
;; If there are no more jobs right now or there are no available workers, wait for one of
|
||||
;; the workers to complete its job
|
||||
(when (or (set-empty? pending) (false? available-worker))
|
||||
(match-define (list worker-id job-url new-urls flags) (thread-receive))
|
||||
(vector-set! worker-ready worker-id #t)
|
||||
;; Update our state with the results
|
||||
(set-add! completed job-url)
|
||||
(for ([u (in-list new-urls)])
|
||||
(unless (set-member? completed u)
|
||||
(set-add! pending u)))
|
||||
(for ([flag (in-list flags)] #:when (not (set-member? all-flags flag)))
|
||||
(print-flag flag)
|
||||
(set-add! all-flags flag)))
|
||||
|
||||
;; Useful debug status is printed in debug mode
|
||||
(print-progress (+ (set-count completed) (set-count pending))
|
||||
(for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
|
||||
(set-count completed) (set-count all-flags))
|
||||
|
||||
(loop)))
|
||||
(print-complete (set-count completed) (set-count all-flags))
|
||||
;; send all workers the shutdown message and wait
|
||||
(for ([thd (in-vector worker-threads)])
|
||||
(thread-send thd #f)
|
||||
(thread-wait thd)))
|
||||
;; start a new thread so we get a unique mailbox
|
||||
(thread-wait (thread jobserver-thread)))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; public static void main open paren string square brackets args
|
||||
(define req
|
||||
(make-http-req 'GET "/hello-world" DEFAULT-HDRS))
|
||||
|
||||
(define (run-webcrawler username password)
|
||||
(printf-debug "the credentials are: ~s ~s\n" username password)
|
||||
(printf-debug "logging in...\n")
|
||||
(crawler-login username password)
|
||||
(printf-debug "logged in. Starting crawl\n")
|
||||
(define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
|
||||
(crawler-jobserver base-url))
|
||||
(printf "the credentials are: ~s ~s\n" username password)
|
||||
(write-http-msg req))
|
||||
|
||||
;; Parse command line arguments and run
|
||||
(module+ main
|
||||
(command-line
|
||||
#:program "webcrawler"
|
||||
#:once-each
|
||||
[("-d") "Debug mode" (debug-mode? #t)]
|
||||
#:args
|
||||
(username password)
|
||||
(run-webcrawler username password)))
|
||||
|
|
Loading…
Reference in New Issue