Compare commits

...

21 Commits

Author SHA1 Message Date
xenia 57a2e1dbac Fix makefile 2020-04-12 02:03:16 -04:00
Milo Turner ea6bc67489 flags get 2020-04-12 01:49:16 -04:00
xenia 4475e50cc5 Add comments (and now it's sleepy time :iitalics: :blobcatsleepreach:) 2020-04-11 04:57:08 -04:00
xenia 1c9c029768 i'm bad and it's 4am 2020-04-11 04:37:03 -04:00
xenia bd63237c17 Fix issue with duplicate flags 2020-04-11 04:31:33 -04:00
xenia 23f7407366 Update readme; update debug prints 2020-04-11 04:22:05 -04:00
xenia f57aef07c0 Add readme and flags 2020-04-11 04:08:13 -04:00
xenia feacb2f68f Implement crawling (i think!) 2020-04-11 04:02:33 -04:00
xenia dabf565d2b Add crawler handling for correct urls, response codes 301, 302, 403, 404, 500 2020-04-11 02:43:12 -04:00
xenia e39b9addfe Implement login 2020-04-11 01:46:34 -04:00
xenia 66ca83ded6 Fix bugs in cookiejar and transfer-encoding 2020-04-11 01:46:30 -04:00
xenia e3fefc7af4 Fix some mishandling of headers to make cookiejar work 2020-04-11 01:09:26 -04:00
xenia b8f0cc5179 Move util code to util.rkt 2020-04-11 00:32:34 -04:00
Milo Turner 76207c5326 fix comment 2020-04-10 23:34:42 -04:00
Milo Turner 1b84b5119e http-msg-headers is alist now 2020-04-10 23:33:45 -04:00
xenia e1af5acd3f Find flag 2020-04-10 23:17:24 -04:00
Milo Turner 4baeabb263 http-socket stuff 2020-04-10 23:17:01 -04:00
Milo Turner ac0a113c83 add 400 status code 2020-04-10 23:17:01 -04:00
Milo Turner 6e564342c0 forgot to provide these from http-msg
.
2020-04-10 23:17:01 -04:00
Milo Turner 6157a14f37 force-http-body 2020-04-10 23:17:01 -04:00
xenia 47e830154c Add debug mode and cookie jar 2020-04-10 21:48:56 -04:00
8 changed files with 496 additions and 48 deletions

View File

@ -1,4 +1,12 @@
.PHONY: all test run
.PHONY: all test run clean
SOURCES=Makefile README.md private/util.rkt secret_flags smol-http/http-msg.rkt smol-http/info.rkt smol-http/main.rkt smol-http/util.rkt webcrawler
CCIS_TEAM_NAME=hash-lang-uwu
CCIS_PROJECT_NAME=project4
CCIS_SERVER=login.ccs.neu.edu
CCIS_PATH=CS3700/$(CCIS_PROJECT_NAME)/
CCIS_FMT_CHECK=/course/cs3700sp20/code/project4/project4_format_check.py
CCIS_TURNIN=/course/cs3700sp20/bin/turnin
all: .setup
raco setup smol-http
@ -14,8 +22,20 @@ endif
ifndef FB_PASSWORD
$(error "Provide FB_PASSWORD variable")
endif
./webcrawler $(FB_USERNAME) $(FB_PASSWORD)
./webcrawler -d $(FB_USERNAME) $(FB_PASSWORD)
.setup:
raco pkg install smol-http/
-raco pkg install smol-http/
@touch $@
upload: $(SOURCES)
@rsync -avzzR --progress $^ $(CCIS_SERVER):$(CCIS_PATH)
@ssh $(CCIS_SERVER) -t -- "tput bold; $(CCIS_FMT_CHECK) $(CCIS_PATH); tput sgr0"
submit: upload
ssh $(CCIS_SERVER) -t -- "cd $(CCIS_PATH); make clean; $(CCIS_TURNIN) $(CCIS_PROJECT_NAME) ."
clean:
$(RM) .setup
find . \( -iname '*.zo' -o -iname '*.dep' \) -delete
find . -type d -iname compiled -delete

31
README.md Normal file
View File

@ -0,0 +1,31 @@
__ __ __
__/ // /_/ /___ _____ ____ _ __ ___ ____ __
/_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
/_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
/_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
/____/
## High level approach
We started by creating robust abstracted HTTP-handling code, which is located in the `smol-http`
module of this project. The HTTP code implements a subset of HTTP 1.1 which is enough to meet the
requirements for crawling the target web server. It also uses plain TCP sockets to communicate using
its HTTP implementation. We used Racket standard library functions to parse and manipulate URLs as
well as parse HTML (as XML, hopefully it's well-formed!) in order to find the hyperlinks on the page
as well as the flags. We implemented a high-performance Certified Web Scale(tm) crawling scheduler
with a distributed work queue to allow for a very high crawling rate; the crawler takes only
minutes to complete on our machines and finds all the flags very quickly.
## Challenges
The current pandemic situation continues to make this semester difficult. Otherwise, we didn't run
into any major issues during this project.
## Testing
We unit tested the HTTP handling code in smol-http, and used ad-hoc manual testing against the
target server to test the complete crawling functionality.
We have an additional `-d` flag which will print useful debug info during the execution of the
crawler, which may be helpful for manual testing.

121
private/util.rkt Normal file
View File

@ -0,0 +1,121 @@
#lang racket
(require xml)
(provide debug-mode? printf-debug print-progress print-complete print-flag
current-cookie-jar update-cookie-jar! cookie-jar->header cookie-jar-ref
string->xexpr find-flags find-hrefs)
;; scheme is bad and it should feel bad
;; Alias `member` under a predicate-style name: `member` returns a truthy
;; list tail (not #t), which is all the `(? ...)` match patterns below need.
(define member? member)
;; Debug prints
;; Parameter toggling debug output; off by default.
(define debug-mode? (make-parameter #f))
;; ... ->
;; Forwards its arguments to printf, but only while debug mode is enabled;
;; otherwise produces no output at all.
(define (printf-debug . fmt-args)
  (cond
    [(debug-mode?) (apply printf fmt-args)]
    [else (void)]))
;; Int Int Int Int ->
;; Redraws a one-line progress display on the console (debug mode only).
;; "\r\x1b[K" returns to column 0 and clears the line so each update
;; overwrites the previous one.
(define (print-progress total-pages in-flight crawled-pages num-flags)
  (cond
    [(debug-mode?)
     (printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a"
             crawled-pages total-pages in-flight num-flags)
     (flush-output (current-output-port))]
    [else (void)]))
;; ->
;; Announces that the crawl finished, with page and flag totals
;; (debug mode only); clears any progress line first.
(define (print-complete total-pages num-flags)
  (cond
    [(debug-mode?)
     (printf "\r\x1b[KCrawl complete: ~a pages crawled, ~a flags found\n"
             total-pages num-flags)]
    [else (void)]))
;; Str ->
;; Writes a flag to stdout. In debug mode the in-place progress line is
;; cleared first so the flag is not garbled into it.
(define (print-flag flag)
  (cond
    [(debug-mode?) (printf "\r\x1b[K~a\n" flag)]
    [else (displayln flag)]))
;; Cookie jar
;; Fields: jar   - mutable hash mapping cookie name (String) -> value (String)
;;         cache - box holding a prebuilt `(cookie . "k=v; ...")` header pair,
;;                 so building a request does not rebuild the string each time
;;         sema  - semaphore guarding jar and cache
(struct cj [jar cache sema] #:transparent)
;; -> CookieJar
;; Creates empty cookie jar
;; Since Racket threads are preemptive (gross) we include a semaphore to guard against concurrent
;; modification
(define (make-cookie-jar)
(cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
;; Parameter for current cookie jar
;; NOTE(review): all code sharing this parameterization shares one jar.
(define current-cookie-jar (make-parameter (make-cookie-jar)))
;; CookieJar String -> String
;; Looks up the value stored for `key`, holding the jar's semaphore for the
;; duration of the read.
(define (cookie-jar-ref jar key)
  (define (lookup) (hash-ref (cj-jar jar) key))
  (call-with-semaphore (cj-sema jar) lookup))
;; CookieJar [List-of (cons Symbol String)] ->
;; For a response with the given headers, finds any set-cookie headers and updates the jar
;; Holds the jar's semaphore for the whole update so readers never observe a
;; partially rebuilt cache.
(define (update-cookie-jar! jar hdrs)
(call-with-semaphore
(cj-sema jar)
(lambda ()
;; Store each set-cookie header's leading "name=value" part; attributes
;; after the first ';' (Path, Expires, ...) are discarded.
(for ([hdr (in-list hdrs)])
(when (symbol=? 'set-cookie (car hdr))
(define kv (string-trim (first (string-split (cdr hdr) ";"))))
(match-define (list k v) (string-split kv "="))
(hash-set! (cj-jar jar) k v)))
;; Rebuild the cached `cookie` request-header pair from the full jar.
(set-box!
(cj-cache jar)
(cons 'cookie
(string-join
(for/list ([(k v) (in-hash (cj-jar jar))])
(format "~a=~a" k v)) "; "))))))
;; CookieJar -> (cons Symbol String)
;; Returns the prebuilt `cookie` request-header pair, reading the cached box
;; under the jar's semaphore.
(define (cookie-jar->header jar)
  (define (read-cache) (unbox (cj-cache jar)))
  (call-with-semaphore (cj-sema jar) read-cache))
;; HTML (XML) handling
;; String -> xexpr
;; Parses the given string as an XML document and returns its root element
;; converted to an xexpr.
(define (string->xexpr str)
  (define doc (read-xml (open-input-string str)))
  (xml->xexpr (document-element doc)))
;; xexpr -> [List-of String]
;; Finds all flags in the given xexpr
(define (find-flags xexpr)
(match xexpr
;; An element whose attribute list contains `(class "secret_flag")` and
;; whose sole child is a string is a flag candidate.
[(list _ (? (curry member? '(class "secret_flag"))) str)
(match (string-split (string-trim str) " ")
;; Match the exact flag format
;; i.e. "FLAG: <64-character value>"; anything else yields no flag.
[(list "FLAG:" (? (compose (curry = 64) string-length) flag))
(list flag)]
[_ '()])]
;; Any other element: recurse into its children and concatenate results.
[(list tag params rst ...)
(foldl (lambda (x r) (append r (find-flags x))) '() rst)]
;; Strings and other leaves contain no flags.
[_ '()]))
;; xexpr -> [List-of String]
;; Finds all <a> tag hrefs in the given xexpr
(define (find-hrefs xexpr)
;; Recurse into a list of child xexprs, concatenating their hrefs.
(define (sub-find rst)
(foldl (lambda (x r) (append r (find-hrefs x))) '() rst))
(match xexpr
;; Anchor element: collect every href attribute value, then keep searching
;; its children as well.
[(list 'a params rst ...)
(append
(for/list ([param (in-list params)]
#:when (symbol=? 'href (first param)))
(second param))
(sub-find rst))]
;; Any other element: just search the children.
[(list tag params rst ...)
(sub-find rst)]
;; Strings and other leaves have no links.
[_ '()]))

10
secret_flags Normal file
View File

@ -0,0 +1,10 @@
0c0e8d5ceeb3da197450e74eb77010bcc0a13d1643575e72c5d573724945d9e6
fe661b328cdd23a3d8ddd84b7b4254c525860328dd40046dd8677f91dca6341d
8eccc00d948fc733c345c730ac86b8194aaa0376b5438b983ad00e1dd0fab73f
388de86a629eb89093b27e9a92a29fd4e45c3aced219be1059238fd471849700
38c7161b87ae13bc2c0a1499cf0ec4d0f6eace9f12cfb9042b26a969fd20dc48
9b4410bffa98dc2c3f5f466bdb9bce63cec1d6d498a55d91d873cab3544cc0f4
0fc45f2341dc966de6e350db2d0cc3070db99a5c66b06b0f8272b35bcfbf525d
7926b87e2a1ac49700f0bccdec585fe0874f009a8ef5c5d0fe9ef7135fcf66eb
d1584bcc2e42017db2c7146ab721d9172f2ae03d4ac42d8b6d6561125c7a212f
c5fbc705700769d6c6bfc182f9e8cc20b340f92d4be4ffda00eb311375973542

View File

@ -1,8 +1,13 @@
#lang racket/base
(provide http-msg?
http-msg-headers
http-msg-header
http-msg-body
http-rsp-code
write-http-msg
read-http-msg
http-add-headers
http-set-body
http-req? make-http-req
http-res? make-http-res
@ -10,6 +15,7 @@
http-body?
empty-http-body
write-http-body
force-http-body
in-http-body-chunks)
(require racket/match
@ -34,7 +40,7 @@
(and (http-msg? h) (http-start-line:res? (http-msg-start-line h))))
(define (make-msg sln hdrs)
(http-set-headers (http-msg sln (hasheq) empty-http-body) hdrs))
(http-add-headers (http-msg sln '() empty-http-body) hdrs))
;; (make-http-req method path [hdrs]) -> http-req?
;; method : http-request-method?
@ -49,31 +55,50 @@
(define (make-http-res code [hdrs '()])
(make-msg (http-start-line:res code) hdrs))
;; (http-set-headers msg hdrs) -> http-msg?
;; (http-set-header msg k) -> (or/c bytes? #f)
;; msg : http-msg?
;; k : symbol?
(define (http-msg-header msg k)
(cond
[(assoc k (http-msg-headers msg)) => cdr]
[else #f]))
;; (http-rsp-code msg) -> http-response-code?
;; msg : http-rsp?
(define (http-rsp-code msg)
(http-start-line:res-code (http-msg-start-line msg)))
;; (http-add-headers msg hdrs) -> http-msg?
;; msg : http-msg?
;; hdrs : (listof (cons/c symbol? any?))
(define (http-set-headers msg hdrs)
(define (http-add-headers msg hdrs)
(struct-copy http-msg msg
[headers (for/fold ([hdrs (http-msg-headers msg)])
([kv (in-list hdrs)])
(if (cdr kv)
(hash-set hdrs (car kv) (->bytes (cdr kv)))
(hash-remove hdrs (car kv))))]))
[headers (append (http-msg-headers msg) hdrs)]))
;; (http-set-body msg how) -> http-msg?
;; msg : http-msg?
;; how : (or/c http-body? (http-body? . -> . http-body?))
(define (http-set-body msg how)
(define remove-old-headers
(for/list ([kv (in-list (http-body-additional-headers (http-msg-body msg)))])
(cons (car kv) #f)))
(define bdy*
(match how
[(? http-body? bdy*) bdy*]
[(? procedure? f) (f (http-msg-body msg))]))
(http-set-headers (struct-copy http-msg msg [body bdy*])
(append remove-old-headers
(http-body-additional-headers bdy*))))
(http-add-headers (struct-copy http-msg msg [body bdy*])
(http-body-additional-headers bdy*)))
;; We need an unchecked (no added headers) version because otherwise when parsing responses we add
;; duplicate headers
;; I was too lazy to abstract this i'm sorry milo ~ 🦈
;; (http-set-body/unchecked msg how) -> http-msg?
;; msg : http-msg?
;; how : (or/c http-body? (http-body? . -> . http-body?))
(define (http-set-body/unchecked msg how)
(define bdy*
(match how
[(? http-body? bdy*) bdy*]
[(? procedure? f) (f (http-msg-body msg))]))
(struct-copy http-msg msg [body bdy*]))
;; (write-http-msg msg [port]) -> void?
;; msg : http-msg?
@ -85,8 +110,8 @@
(fprintf port "~a ~a HTTP/1.1\r\n" mthd path)]
[(http-start-line:res code)
(fprintf port "HTTP/1.1 ~a ~a\r\n" code (http-response-code-name code))])
(for ([k (in-list (sort (hash-keys hdrs) symbol<?))])
(fprintf port "~a: ~a\r\n" k (hash-ref hdrs k)))
(for ([kv (in-list hdrs)])
(fprintf port "~a: ~a\r\n" (car kv) (cdr kv)))
(write-bytes #"\r\n" port)
(write-http-body body port))
@ -99,8 +124,8 @@
(check-pred http-req? ex-req-/)
(check-pred http-res? ex-res-ok)
(check-equal? (hash-ref (http-msg-headers ex-req-/) 'host) #"localhost")
(check-equal? (hash-ref (http-msg-headers ex-res-ok) 'content-length) #"11")
;(check-equal? (hash-ref (http-msg-headers ex-req-/) 'host) #"localhost")
;(check-equal? (hash-ref (http-msg-headers ex-res-ok) 'content-length) #"11")
(define-syntax-rule (check-http-msg->bytes msg bs ...)
(check-equal? (let ([p (open-output-bytes)])
@ -117,11 +142,7 @@
#"HTTP/1.1 200 OK\r\n"
#"content-length: 11\r\n"
#"\r\n"
#"Hello world")
(check-http-msg->bytes (http-set-body ex-res-ok empty-http-body)
#"HTTP/1.1 200 OK\r\n"
#"\r\n"))
#"Hello world"))
;; ---------------------------------------------------------------------------------------
;; Message bodies
@ -139,7 +160,7 @@
(define (write-http-body bdy [port (current-output-port)])
(match bdy
[#f (void)]
[(? bytes? bs) (write-bytes bs port)]
[(? bytes? bs) (void (write-bytes bs port))]
[(? stream? bss) (error 'write-http-body "chunked is not implemented sorry. ;(")]))
;; (in-http-body-chunks bdy) -> (streamof bytes)
@ -150,10 +171,16 @@
[(? bytes? bs) (list bs)]
[(? stream? bss) bss]))
;; (force-http-body bdy) -> void?
;; bdy : http-body?
(define (force-http-body bdy)
(when (stream? bdy)
(for ([x bdy]) (void))))
(define (http-body-additional-headers bdy)
(match bdy
[#f '()]
[(? bytes? bs) `([content-length . ,(bytes-length bs)])]
[(? bytes? bs) `([content-length . ,(number->string (bytes-length bs))])]
[(? stream?) '([transfer-encoding . #"chunked"])]))
;; ---------------------------------------------------------------------------------------
@ -175,8 +202,7 @@
(define (string->header-kv s)
(match (regexp-match #px"^([^:]+):\\s*(.*)$" s)
[(list _ k v) (cons (string->symbol (string-downcase k))
(->bytes v))]
[(list _ k v) (cons (string->symbol (string-downcase k)) v)]
[_ #f]))
(define (read-crlf-line port)
@ -198,25 +224,27 @@
(or (string->header-kv hln)
(raise-read-http-error "read-http-msg: invalid header line: ~s" hln))))
(define msg (make-msg sln hdrs))
(define bdy (read-http-body (hash-ref (http-msg-headers msg) 'transfer-encoding #f)
(hash-ref (http-msg-headers msg) 'content-length #f)
(define bdy (read-http-body (http-msg-header msg 'transfer-encoding)
(http-msg-header msg 'content-length)
port))
(http-set-body msg bdy))
(http-set-body/unchecked msg bdy))
;; (http-body-reader tenc clen [port]) -> http-body?
;; tenc, clen : (or/c #f bytes?)
;; port : input-port?
(define (read-http-body tenc clen [port (current-input-port)])
(define len (and clen (string->int (bytes->string/utf-8 clen))))
(define len (and clen (string->int clen)))
(cond
[(equal? tenc #"chunked") (read-chunked-stream port)]
[(equal? tenc "chunked") (read-chunked-stream port)]
[(number? len) (read-bytes len port)]
[(and (not tenc) (not len)) #f]
[else (raise-read-http-error "read-http-body: not sure how to read HTTP body")]))
;; input-port? -> [streamof bytes?]
(define (read-chunked-stream port)
(define len (string->number (read-crlf-line port)))
(define len (string->number (read-crlf-line port) 16))
(unless (integer? len)
(raise-read-http-error "read-chunked-stream: invalid chunked encoding detected"))
(define data (read-bytes len port))
(match-define #"\r\n" (read-bytes 2 port))
(if (zero? len)
@ -241,9 +269,9 @@
#"Content-Length: 5\r\n"
#"\r\n"
#"hello"
(http-set-body (make-http-res 200 '([content-type . "text/html"]
[content-length . 5]))
#"hello"))
(http-set-body/unchecked (make-http-res 200 '([content-type . "text/html"]
[content-length . "5"]))
#"hello"))
(let ([msg (read-http-msg
(open-input-bytes (bytes-append #"HTTP/1.1 200 OK\r\n"

View File

@ -1,9 +1,52 @@
#lang racket/base
(require "./util.rkt"
"./http-msg.rkt")
(provide (all-from-out "./http-msg.rkt")
http-request-method?
http-response-code?
http-response-code-name)
http-response-code-name
(module+ test)
http-socket?
http-connect
http-request
http-close)
(require racket/tcp
racket/match
"./util.rkt"
"./http-msg.rkt")
;; ---------------------------------------------------------------------------------------
(define DEFAULT-PORT 80)
;; An open HTTP connection: extra-headers is an alist added to every request;
;; conn-pair is (cons input-port output-port) from tcp-connect.
(struct http-socket [extra-headers conn-pair])
;; (http-connect host [port]) -> http-socket?
;; host : string?
;; port : port-number?
;; #:headers : extra request headers; a `host` header is always prepended.
(define (http-connect host
[port DEFAULT-PORT]
#:headers [x-hdrs '()])
(http-socket `([host . ,host] ,@x-hdrs)
(let-values ([(in out) (tcp-connect host port)])
(cons in out))))
;; (http-close sock) -> void?
;; sock : http-socket?
;; Abandons both directions of the underlying TCP connection.
(define (http-close sock)
  (define conn (http-socket-conn-pair sock))
  (tcp-abandon-port (car conn))
  (tcp-abandon-port (cdr conn)))
;; (http-request sock req) -> http-res?
;; sock : http-socket?
;; req : http-req?
;; Sends req (with the socket's extra headers appended), flushes, and reads
;; one response from the same connection.
;; NOTE(review): assumes exclusive use of the socket for the duration of the
;; request/response exchange — confirm callers never share a socket.
(define (http-request sock req)
(define req* (http-add-headers req (http-socket-extra-headers sock)))
(match-define (cons in out) (http-socket-conn-pair sock))
(write-http-msg req* out)
(flush-output out)
(read-http-msg in))
(module+ test
)

View File

@ -16,6 +16,7 @@
;; Maps numeric status code -> reason phrase used on response start lines.
;; NOTE(review): RFC 7231 names 400 "Bad Request"; "Invalid Request" is this
;; project's wording — confirm nothing matches on the phrase.
(define http-status-codes
#hasheqv([200 . "OK"]
[301 . "Moved Permanently"]
[400 . "Invalid Request"]
[403 . "Forbidden"]
[404 . "Not Found"]
[500 . "Internal Server Error"]))

View File

@ -1,22 +1,216 @@
#!/usr/bin/env racket
#lang racket
; vim: ft=racket
(require smol-http)
; __ __ __
; __/ // /_/ /___ _____ ____ _ __ ___ ____ __
; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
; /____/
(require net/url-string
smol-http
"private/util.rkt")
;; Crawl target host
(define HOST "fring.ccs.neu.edu")
;; Path where crawling starts
(define ROOT-PATH "/fakebook/")
;; Path of the login form (GET for the csrf cookie, POST to authenticate)
(define LOGIN-PATH "/accounts/login/")
;; Headers attached to every outgoing request
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
;; christo pls do not track thanks
(dnt . "1")))
(define req
(make-http-req 'GET "/hello-world" DEFAULT-HDRS))
;; ---------------------------------------------------------------------------------------
;; HTTP fetch logic
;; Symbol String [(or/c #f bytes?)] -> (values response socket)
;; Returns response for request without closing the socket.
;; Redirects (301/302) and server errors (500) are retried transparently;
;; other codes (403/404/...) are left to the caller.
(define (crawler-fetch/noclose method path [body #f])
  ;; Create the request based on the given parameters
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  ;; Do the thing, with the thing
  ;; Include cookie jar cookies
  (define sock
    (http-connect
     HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                          DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  ;; If we got any new cookies (yum!!) add them to the jar
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  ;; Abstract over some response codes we can handle directly here
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
     (define new-location (http-msg-header rsp 'location))
     (http-close sock)
     (crawler-fetch/noclose method new-location body)]
    [500
     ;; handle server failure retries transparently
     ;; BUGFIX: close the failed connection before retrying; previously each
     ;; 500 retry leaked one open TCP socket.
     (http-close sock)
     (crawler-fetch/noclose method path body)]
    [_
     ;; other stuff like 403/404 up to caller
     (values rsp sock)]))
;; Returns response for request, closing socket
;; Same arguments as crawler-fetch/noclose.
(define (crawler-fetch . params)
  (call-with-values
   (lambda () (apply crawler-fetch/noclose params))
   (lambda (rsp sock)
     (http-close sock)
     rsp)))
;; Fetches request and tries to parse the result as xexpr
;; Same arguments as crawler-fetch/noclose; the socket is closed before
;; returning (except on the error branches, which abort).
(define (crawler-fetch/xexpr . params)
(define-values [rsp sock] (apply crawler-fetch/noclose params))
(match (http-rsp-code rsp)
;; return an empty response for 403 and 404. the page won't be visited again
;; because it will have been added to the complete set
[(or 403 404)
(http-close sock)
'()]
;; we shouldn't run into this one
;; NOTE(review): this error path leaves sock open — acceptable since it
;; aborts the worker, but confirm.
[400 (error "you screwed up, got error 400 :angery:" params)]
;; normal response yay!!
[200
;; Concatenate all body chunks into one byte string, then parse as XML.
(define xe
(string->xexpr
(bytes->string/utf-8
(for/fold ([res #""])
([chunk (in-http-body-chunks (http-msg-body rsp))])
(bytes-append res chunk)))))
(http-close sock)
xe]
[code (http-close sock) (error "unexpected response code" code)]))
;; ---------------------------------------------------------------------------------------
;; Utilities
;; Logs in with the given username and password
;; The initial GET obtains the csrftoken cookie, which must be echoed back in
;; the POSTed form as csrfmiddlewaretoken.
;; NOTE(review): form values are not URL-encoded — assumes credentials contain
;; no reserved characters; confirm.
(define (crawler-login username password)
(crawler-fetch 'GET LOGIN-PATH)
(define form-body
(format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
username password (cookie-jar-ref (current-cookie-jar) "csrftoken")))
(crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
(void))
;; Checks if this is a URL we should crawl: plain http, on the target host.
;; Anything that is not a url struct is rejected.
(define (crawler-valid-url? page-url)
  (match page-url
    [(struct* url ([scheme "http"] [host (== HOST)])) #t]
    [_ #f]))
;; ---------------------------------------------------------------------------------------
;; Main crawler iterator
;; Fetches one page and returns two values: the crawlable URLs found on it
;; (may contain duplicates) and any flags found on it.
(define (crawler-iterate-one page-url)
;; Rebuild an absolute path string from the url's path segments.
(let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
[xe (crawler-fetch/xexpr 'GET path)]
[flags (find-flags xe)]
;; Resolve each href relative to the page it appeared on.
[page-links (map (lambda (x) (combine-url/relative page-url x))
(find-hrefs xe))])
;; only return URLs that are OK to crawl
(values (filter crawler-valid-url? page-links) flags)))
;; ---------------------------------------------------------------------------------------
;; Tasking logic
;; Number of concurrent worker threads
(define NUM-TASKS 256)
;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
;; Each job received in the mailbox is a URL; each result sent back to the
;; jobserver is (list worker-id job-url new-urls flags).
(define (crawler-worker-thread worker-id jobserver)
(let loop ()
(define next-job (thread-receive))
;; jobserver sends a #f when it's time to exit
(when next-job
(define-values (new-urls flags) (crawler-iterate-one next-job))
(thread-send jobserver (list worker-id next-job new-urls flags))
(loop))))
;; Starts up all the threads and schedules jobs until crawling is complete
;; URL -> (blocks until the crawl finishes and all workers have exited)
(define (crawler-jobserver root-url)
;; Inner function to be run in its own thread
;; (so thread-receive reads from a mailbox only workers send to)
(define (jobserver-thread)
;; Create workers
(define worker-threads
(let ([parent (current-thread)])
(for/vector ([i (in-range NUM-TASKS)])
(thread (lambda () (crawler-worker-thread i parent))))))
;; Create worker status - #t means the worker is ready for the next task
;; #f means the worker is currently busy
(define worker-ready (make-vector NUM-TASKS #t))
;; URLs we have yet to crawl
(define pending (mutable-set root-url))
;; URLs we have crawled already
(define completed (mutable-set))
;; All flags seen so far
(define all-flags (mutable-set))
;; Main loop
(let loop ()
(define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
;; Loop until there are no more busy workers and there are also no more pending URLs
;; Then, we are done
(unless (and (not any-busy) (set-empty? pending))
;; Find any available worker to give the next job to
(define available-worker
(for/first ([w (in-naturals)]
[r (in-vector worker-ready)]
#:when r)
w))
;; If there is a worker and a job, assign the job
(when (and (not (set-empty? pending)) available-worker)
(define next-job (set-first pending))
(set-remove! pending next-job)
(thread-send (vector-ref worker-threads available-worker) next-job)
(vector-set! worker-ready available-worker #f)
(set! any-busy #t))
;; If there are no more jobs right now or there are no available workers, wait for one of
;; the workers to complete its job
(when (or (set-empty? pending) (false? available-worker))
(match-define (list worker-id job-url new-urls flags) (thread-receive))
(vector-set! worker-ready worker-id #t)
;; Update our state with the results
(set-add! completed job-url)
;; Only enqueue URLs we have not already crawled
(for ([u (in-list new-urls)])
(unless (set-member? completed u)
(set-add! pending u)))
;; Report each distinct flag exactly once
(for ([flag (in-list flags)] #:when (not (set-member? all-flags flag)))
(print-flag flag)
(set-add! all-flags flag)))
;; Useful debug status is printed in debug mode
(print-progress (+ (set-count completed) (set-count pending))
(for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
(set-count completed) (set-count all-flags))
(loop)))
(print-complete (set-count completed) (set-count all-flags))
;; send all workers the shutdown message and wait
(for ([thd (in-vector worker-threads)])
(thread-send thd #f)
(thread-wait thd)))
;; start a new thread so we get a unique mailbox
(thread-wait (thread jobserver-thread)))
;; ---------------------------------------------------------------------------------------
;; public static void main open paren string square brackets args
(define (run-webcrawler username password)
(printf "the credentials are: ~s ~s\n" username password)
(write-http-msg req))
(printf-debug "the credentials are: ~s ~s\n" username password)
(printf-debug "logging in...\n")
(crawler-login username password)
(printf-debug "logged in. Starting crawl\n")
(define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
(crawler-jobserver base-url))
;; Parse command line arguments and run
;; Usage: webcrawler [-d] <username> <password>
(module+ main
(command-line
#:program "webcrawler"
#:once-each
;; -d turns on debug/progress output for the whole run
[("-d") "Debug mode" (debug-mode? #t)]
#:args
(username password)
(run-webcrawler username password)))