Implement crawling (i think!)
This commit is contained in:
parent
dabf565d2b
commit
feacb2f68f
15
Makefile
15
Makefile
|
@ -1,5 +1,13 @@
|
|||
.PHONY: all test run
|
||||
|
||||
SOURCES=$(shell git ls-tree -r master --name-only)
|
||||
CCIS_TEAM_NAME=hash-lang-uwu
|
||||
CCIS_PROJECT_NAME=project4
|
||||
CCIS_SERVER=login.ccs.neu.edu
|
||||
CCIS_PATH=CS3700/$(CCIS_PROJECT_NAME)/
|
||||
CCIS_FMT_CHECK=/course/cs3700sp20/code/project4/project4_format_check.py
|
||||
CCIS_TURNIN=/course/cs3700sp20/bin/turnin
|
||||
|
||||
all: .setup
|
||||
raco setup smol-http
|
||||
raco make webcrawler
|
||||
|
@ -19,3 +27,10 @@ endif
|
|||
.setup:
|
||||
raco pkg install smol-http/
|
||||
@touch $@
|
||||
|
||||
upload: $(SOURCES)
|
||||
@rsync -avzzR --progress $^ $(CCIS_SERVER):$(CCIS_PATH)
|
||||
@ssh $(CCIS_SERVER) -t -- "tput bold; $(CCIS_FMT_CHECK) $(CCIS_PATH); tput sgr0"
|
||||
|
||||
submit: upload
|
||||
ssh $(CCIS_SERVER) -t -- "cd $(CCIS_PATH); make clean; $(CCIS_TURNIN) $(CCIS_PROJECT_NAME) ."
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
;; Prints a flag
|
||||
(define (print-flag flag)
|
||||
(if (debug-mode?)
|
||||
(printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progess
|
||||
(printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress
|
||||
(displayln flag)))
|
||||
|
||||
;; Cookie jar
|
||||
|
@ -92,7 +92,10 @@
|
|||
(define (find-flags xexpr)
|
||||
(match xexpr
|
||||
[(list _ (? (curry member? '(class "secret_flag"))) str)
|
||||
(list str)]
|
||||
(match (string-split (string-trim str) " ")
|
||||
[(list "FLAG:" (? (compose (curry = 64) string-length) flag))
|
||||
(list flag)]
|
||||
[_ '()])]
|
||||
[(list tag params rst ...)
|
||||
(foldl (lambda (x r) (append r (find-flags x))) '() rst)]
|
||||
[_ '()]))
|
||||
|
|
86
webcrawler
86
webcrawler
|
@ -20,6 +20,9 @@
|
|||
;; christo pls do not track thanks
|
||||
(dnt . "1")))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; HTTP fetch logic
|
||||
|
||||
;; Returns response for request without closing the socket
|
||||
(define (crawler-fetch/noclose method path [body #f])
|
||||
(define req
|
||||
|
@ -57,6 +60,7 @@
|
|||
[(or 403 404)
|
||||
(http-close sock)
|
||||
'()]
|
||||
[400 (error "you screwed up, got error 400 :angery:" params)]
|
||||
[200
|
||||
(define xe
|
||||
(string->xexpr
|
||||
|
@ -68,6 +72,9 @@
|
|||
xe]
|
||||
[code (http-close sock) (error "unexpected response code" code)]))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Utilities
|
||||
|
||||
;; Logs in with the given username and password
|
||||
(define (crawler-login username password)
|
||||
(crawler-fetch 'GET LOGIN-PATH)
|
||||
|
@ -83,26 +90,91 @@
|
|||
[(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
|
||||
[_ #f]))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Main crawler iterator
|
||||
|
||||
;; Fetches one page, prints any flags, and returns a list of urls to continue with
|
||||
;; (may contain duplicates)
|
||||
(define (crawler-iterate-one page-url)
|
||||
(let* ([path (string-join (map path/param-path (url-path page-url)) "/")]
|
||||
(let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
|
||||
[xe (crawler-fetch/xexpr 'GET path)]
|
||||
[flags (find-flags xe)]
|
||||
[page-links (map (lambda (x) (combine-url/relative page-url x))
|
||||
(find-hrefs xe))])
|
||||
(for ([flag (in-list flags)])
|
||||
(print-flag flag))
|
||||
(filter crawler-valid-url? page-links)))
|
||||
(values (filter crawler-valid-url? page-links) flags)))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; Tasking logic
|
||||
|
||||
(define NUM-TASKS 256)
|
||||
|
||||
(define (crawler-worker-thread worker-id jobserver)
|
||||
(let loop ()
|
||||
(define next-job (thread-receive))
|
||||
(when next-job
|
||||
(define-values (new-urls flags) (crawler-iterate-one next-job))
|
||||
(thread-send jobserver (list worker-id next-job new-urls flags))
|
||||
(loop))))
|
||||
|
||||
(define (crawler-jobserver root-url)
|
||||
(define (jobserver-thread)
|
||||
(define worker-threads
|
||||
(let ([parent (current-thread)])
|
||||
(for/vector ([i (in-range NUM-TASKS)])
|
||||
(thread (lambda () (crawler-worker-thread i parent))))))
|
||||
(define worker-ready (make-vector NUM-TASKS #t))
|
||||
(define pending (mutable-set root-url))
|
||||
(define completed (mutable-set))
|
||||
(define num-flags (box 0))
|
||||
(let loop ()
|
||||
(define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
|
||||
(unless (and (not any-busy) (set-empty? pending))
|
||||
(define available-worker
|
||||
(for/first ([w (in-naturals)]
|
||||
[r (in-vector worker-ready)]
|
||||
#:when r)
|
||||
w))
|
||||
(when (and (not (set-empty? pending)) available-worker)
|
||||
(define next-job (set-first pending))
|
||||
(set-remove! pending next-job)
|
||||
(thread-send (vector-ref worker-threads available-worker) next-job)
|
||||
(vector-set! worker-ready available-worker #f)
|
||||
(set! any-busy #t))
|
||||
|
||||
(when (or (set-empty? pending) (false? available-worker))
|
||||
(match-define (list worker-id job-url new-urls flags) (thread-receive))
|
||||
(vector-set! worker-ready worker-id #t)
|
||||
(set-add! completed job-url)
|
||||
(for ([u (in-list new-urls)])
|
||||
(unless (set-member? completed u)
|
||||
(set-add! pending u)))
|
||||
(for ([flag (in-list flags)])
|
||||
(print-flag flag)
|
||||
(set-box! num-flags (add1 (unbox num-flags)))))
|
||||
|
||||
(print-progress (+ (set-count completed) (set-count pending))
|
||||
(for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
|
||||
(set-count completed) (unbox num-flags))
|
||||
|
||||
(loop)))
|
||||
(print-complete)
|
||||
;; send all workers the shutdown message and wait
|
||||
(for ([thd (in-vector worker-threads)])
|
||||
(thread-send thd #f)
|
||||
(thread-wait thd)))
|
||||
;; start a new thread so we get a unique mailbox
|
||||
(thread-wait (thread jobserver-thread)))
|
||||
|
||||
;; ---------------------------------------------------------------------------------------
|
||||
;; public static void main open paren string square brackets args
|
||||
|
||||
(define (run-webcrawler username password)
|
||||
(printf-debug "the credentials are: ~s ~s\n" username password)
|
||||
(printf-debug "logging in...\n")
|
||||
(crawler-login username password)
|
||||
(printf-debug "logged in\n")
|
||||
(printf-debug "logged in. Starting crawl\n")
|
||||
(define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
|
||||
(define next-pages (crawler-iterate-one base-url))
|
||||
(displayln (map url->string next-pages)))
|
||||
(crawler-jobserver base-url))
|
||||
|
||||
(module+ main
|
||||
(command-line
|
||||
|
|
Loading…
Reference in New Issue