Implement crawling (i think!)

This commit is contained in:
xenia 2020-04-11 04:02:33 -04:00
parent dabf565d2b
commit feacb2f68f
3 changed files with 99 additions and 9 deletions

View File

@ -1,5 +1,13 @@
.PHONY: all test run .PHONY: all test run
SOURCES=$(shell git ls-tree -r master --name-only)
CCIS_TEAM_NAME=hash-lang-uwu
CCIS_PROJECT_NAME=project4
CCIS_SERVER=login.ccs.neu.edu
CCIS_PATH=CS3700/$(CCIS_PROJECT_NAME)/
CCIS_FMT_CHECK=/course/cs3700sp20/code/project4/project4_format_check.py
CCIS_TURNIN=/course/cs3700sp20/bin/turnin
all: .setup all: .setup
raco setup smol-http raco setup smol-http
raco make webcrawler raco make webcrawler
@ -19,3 +27,10 @@ endif
.setup: .setup:
raco pkg install smol-http/ raco pkg install smol-http/
@touch $@ @touch $@
upload: $(SOURCES)
@rsync -avzzR --progress $^ $(CCIS_SERVER):$(CCIS_PATH)
@ssh $(CCIS_SERVER) -t -- "tput bold; $(CCIS_FMT_CHECK) $(CCIS_PATH); tput sgr0"
submit: upload
ssh $(CCIS_SERVER) -t -- "cd $(CCIS_PATH); make clean; $(CCIS_TURNIN) $(CCIS_PROJECT_NAME) ."

View File

@ -37,7 +37,7 @@
;; Prints a flag ;; Prints a flag
(define (print-flag flag) (define (print-flag flag)
(if (debug-mode?) (if (debug-mode?)
(printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progess (printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress
(displayln flag))) (displayln flag)))
;; Cookie jar ;; Cookie jar
@ -92,7 +92,10 @@
(define (find-flags xexpr) (define (find-flags xexpr)
(match xexpr (match xexpr
[(list _ (? (curry member? '(class "secret_flag"))) str) [(list _ (? (curry member? '(class "secret_flag"))) str)
(list str)] (match (string-split (string-trim str) " ")
[(list "FLAG:" (? (compose (curry = 64) string-length) flag))
(list flag)]
[_ '()])]
[(list tag params rst ...) [(list tag params rst ...)
(foldl (lambda (x r) (append r (find-flags x))) '() rst)] (foldl (lambda (x r) (append r (find-flags x))) '() rst)]
[_ '()])) [_ '()]))

View File

@ -20,6 +20,9 @@
;; christo pls do not track thanks ;; christo pls do not track thanks
(dnt . "1"))) (dnt . "1")))
;; ---------------------------------------------------------------------------------------
;; HTTP fetch logic
;; Returns response for request without closing the socket ;; Returns response for request without closing the socket
(define (crawler-fetch/noclose method path [body #f]) (define (crawler-fetch/noclose method path [body #f])
(define req (define req
@ -57,6 +60,7 @@
[(or 403 404) [(or 403 404)
(http-close sock) (http-close sock)
'()] '()]
[400 (error "you screwed up, got error 400 :angery:" params)]
[200 [200
(define xe (define xe
(string->xexpr (string->xexpr
@ -68,6 +72,9 @@
xe] xe]
[code (http-close sock) (error "unexpected response code" code)])) [code (http-close sock) (error "unexpected response code" code)]))
;; ---------------------------------------------------------------------------------------
;; Utilities
;; Logs in with the given username and password ;; Logs in with the given username and password
(define (crawler-login username password) (define (crawler-login username password)
(crawler-fetch 'GET LOGIN-PATH) (crawler-fetch 'GET LOGIN-PATH)
@ -83,26 +90,91 @@
[(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t] [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
[_ #f])) [_ #f]))
;; ---------------------------------------------------------------------------------------
;; Main crawler iterator
;; Fetches one page, prints any flags, and returns a list of urls to continue with ;; Fetches one page, prints any flags, and returns a list of urls to continue with
;; (may contain duplicates) ;; (may contain duplicates)
(define (crawler-iterate-one page-url) (define (crawler-iterate-one page-url)
(let* ([path (string-join (map path/param-path (url-path page-url)) "/")] (let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
[xe (crawler-fetch/xexpr 'GET path)] [xe (crawler-fetch/xexpr 'GET path)]
[flags (find-flags xe)] [flags (find-flags xe)]
[page-links (map (lambda (x) (combine-url/relative page-url x)) [page-links (map (lambda (x) (combine-url/relative page-url x))
(find-hrefs xe))]) (find-hrefs xe))])
(for ([flag (in-list flags)]) (values (filter crawler-valid-url? page-links) flags)))
(print-flag flag))
(filter crawler-valid-url? page-links))) ;; ---------------------------------------------------------------------------------------
;; Tasking logic
(define NUM-TASKS 256)
(define (crawler-worker-thread worker-id jobserver)
(let loop ()
(define next-job (thread-receive))
(when next-job
(define-values (new-urls flags) (crawler-iterate-one next-job))
(thread-send jobserver (list worker-id next-job new-urls flags))
(loop))))
(define (crawler-jobserver root-url)
(define (jobserver-thread)
(define worker-threads
(let ([parent (current-thread)])
(for/vector ([i (in-range NUM-TASKS)])
(thread (lambda () (crawler-worker-thread i parent))))))
(define worker-ready (make-vector NUM-TASKS #t))
(define pending (mutable-set root-url))
(define completed (mutable-set))
(define num-flags (box 0))
(let loop ()
(define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
(unless (and (not any-busy) (set-empty? pending))
(define available-worker
(for/first ([w (in-naturals)]
[r (in-vector worker-ready)]
#:when r)
w))
(when (and (not (set-empty? pending)) available-worker)
(define next-job (set-first pending))
(set-remove! pending next-job)
(thread-send (vector-ref worker-threads available-worker) next-job)
(vector-set! worker-ready available-worker #f)
(set! any-busy #t))
(when (or (set-empty? pending) (false? available-worker))
(match-define (list worker-id job-url new-urls flags) (thread-receive))
(vector-set! worker-ready worker-id #t)
(set-add! completed job-url)
(for ([u (in-list new-urls)])
(unless (set-member? completed u)
(set-add! pending u)))
(for ([flag (in-list flags)])
(print-flag flag)
(set-box! num-flags (add1 (unbox num-flags)))))
(print-progress (+ (set-count completed) (set-count pending))
(for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
(set-count completed) (unbox num-flags))
(loop)))
(print-complete)
;; send all workers the shutdown message and wait
(for ([thd (in-vector worker-threads)])
(thread-send thd #f)
(thread-wait thd)))
;; start a new thread so we get a unique mailbox
(thread-wait (thread jobserver-thread)))
;; ---------------------------------------------------------------------------------------
;; public static void main open paren string square brackets args
(define (run-webcrawler username password) (define (run-webcrawler username password)
(printf-debug "the credentials are: ~s ~s\n" username password) (printf-debug "the credentials are: ~s ~s\n" username password)
(printf-debug "logging in...\n") (printf-debug "logging in...\n")
(crawler-login username password) (crawler-login username password)
(printf-debug "logged in\n") (printf-debug "logged in. Starting crawl\n")
(define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f)) (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
(define next-pages (crawler-iterate-one base-url)) (crawler-jobserver base-url))
(displayln (map url->string next-pages)))
(module+ main (module+ main
(command-line (command-line