From feacb2f68ff4625d261a3d22c5467906aa630d82 Mon Sep 17 00:00:00 2001
From: haskal
Date: Sat, 11 Apr 2020 04:02:33 -0400
Subject: [PATCH] Implement crawling (i think!)

---
 Makefile         | 15 +++++++++
 private/util.rkt |  7 ++--
 webcrawler       | 86 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 4a09791..a22360f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,13 @@
 .PHONY: all test run
 
+SOURCES=$(shell git ls-tree -r master --name-only)
+CCIS_TEAM_NAME=hash-lang-uwu
+CCIS_PROJECT_NAME=project4
+CCIS_SERVER=login.ccs.neu.edu
+CCIS_PATH=CS3700/$(CCIS_PROJECT_NAME)/
+CCIS_FMT_CHECK=/course/cs3700sp20/code/project4/project4_format_check.py
+CCIS_TURNIN=/course/cs3700sp20/bin/turnin
+
 all: .setup
 	raco setup smol-http
 	raco make webcrawler
@@ -19,3 +27,10 @@ endif
 .setup:
 	raco pkg install smol-http/
 	@touch $@
+
+upload: $(SOURCES)
+	@rsync -avzzR --progress $^ $(CCIS_SERVER):$(CCIS_PATH)
+	@ssh $(CCIS_SERVER) -t -- "tput bold; $(CCIS_FMT_CHECK) $(CCIS_PATH); tput sgr0"
+
+submit: upload
+	ssh $(CCIS_SERVER) -t -- "cd $(CCIS_PATH); make clean; $(CCIS_TURNIN) $(CCIS_PROJECT_NAME) ."
diff --git a/private/util.rkt b/private/util.rkt
index b112452..67e61f3 100644
--- a/private/util.rkt
+++ b/private/util.rkt
@@ -37,7 +37,7 @@
 ;; Prints a flag
 (define (print-flag flag)
   (if (debug-mode?)
-      (printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progess
+      (printf "\r\x1b[K~a\n" flag) ; make sure not to garble the debug progress
       (displayln flag)))
 
 ;; Cookie jar
@@ -92,7 +92,10 @@
 (define (find-flags xexpr)
   (match xexpr
     [(list _ (? (curry member? '(class "secret_flag"))) str)
-     (list str)]
+     (match (string-split (string-trim str) " ")
+       [(list "FLAG:" (? (compose (curry = 64) string-length) flag))
+        (list flag)]
+       [_ '()])]
     [(list tag params rst ...)
      (foldl (lambda (x r) (append r (find-flags x))) '() rst)]
     [_ '()]))
diff --git a/webcrawler b/webcrawler
index 09d0a74..bfcb466 100755
--- a/webcrawler
+++ b/webcrawler
@@ -20,6 +20,9 @@
     ;; christo pls do not track thanks
     (dnt . "1")))
 
+;; ---------------------------------------------------------------------------------------
+;; HTTP fetch logic
+
 ;; Returns response for request without closing the socket
 (define (crawler-fetch/noclose method path [body #f])
   (define req
@@ -57,6 +60,7 @@
     [(or 403 404)
      (http-close sock)
      '()]
+    [400 (error "you screwed up, got error 400 :angery:" params)]
     [200
      (define xe
       (string->xexpr
@@ -68,6 +72,9 @@
      xe]
     [code (http-close sock) (error "unexpected response code" code)]))
 
+;; ---------------------------------------------------------------------------------------
+;; Utilities
+
 ;; Logs in with the given username and password
 (define (crawler-login username password)
   (crawler-fetch 'GET LOGIN-PATH)
@@ -83,26 +90,91 @@
     [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
     [_ #f]))
 
+;; ---------------------------------------------------------------------------------------
+;; Main crawler iterator
+
 ;; Fetches one page, prints any flags, and returns a list of urls to continue with
 ;; (may contain duplicates)
 (define (crawler-iterate-one page-url)
-  (let* ([path (string-join (map path/param-path (url-path page-url)) "/")]
+  (let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
          [xe (crawler-fetch/xexpr 'GET path)]
          [flags (find-flags xe)]
          [page-links (map (lambda (x) (combine-url/relative page-url x)) (find-hrefs xe))])
-    (for ([flag (in-list flags)])
-      (print-flag flag))
-    (filter crawler-valid-url?
-            page-links)))
+    (values (filter crawler-valid-url? page-links) flags)))
+
+;; ---------------------------------------------------------------------------------------
+;; Tasking logic
+
+(define NUM-TASKS 256)
+
+(define (crawler-worker-thread worker-id jobserver)
+  (let loop ()
+    (define next-job (thread-receive))
+    (when next-job
+      (define-values (new-urls flags) (crawler-iterate-one next-job))
+      (thread-send jobserver (list worker-id next-job new-urls flags))
+      (loop))))
+
+(define (crawler-jobserver root-url)
+  (define (jobserver-thread)
+    (define worker-threads
+      (let ([parent (current-thread)])
+        (for/vector ([i (in-range NUM-TASKS)])
+          (thread (lambda () (crawler-worker-thread i parent))))))
+    (define worker-ready (make-vector NUM-TASKS #t))
+    (define pending (mutable-set root-url))
+    (define completed (mutable-set))
+    (define num-flags (box 0))
+    (let loop ()
+      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
+      (unless (and (not any-busy) (set-empty? pending))
+        (define available-worker
+          (for/first ([w (in-naturals)]
+                      [r (in-vector worker-ready)]
+                      #:when r)
+            w))
+        (when (and (not (set-empty? pending)) available-worker)
+          (define next-job (set-first pending))
+          (set-remove! pending next-job)
+          (thread-send (vector-ref worker-threads available-worker) next-job)
+          (vector-set! worker-ready available-worker #f)
+          (set! any-busy #t))
+
+        (when (or (set-empty? pending) (false? available-worker))
+          (match-define (list worker-id job-url new-urls flags) (thread-receive))
+          (vector-set! worker-ready worker-id #t)
+          (set-add! completed job-url)
+          (for ([u (in-list new-urls)])
+            (unless (set-member? completed u)
+              (set-add! pending u)))
+          (for ([flag (in-list flags)])
+            (print-flag flag)
+            (set-box! num-flags (add1 (unbox num-flags)))))
+
+        (print-progress (+ (set-count completed) (set-count pending))
+                        (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
+                        (set-count completed) (unbox num-flags))
+
+        (loop)))
+    (print-complete)
+    ;; send all workers the shutdown message and wait
+    (for ([thd (in-vector worker-threads)])
+      (thread-send thd #f)
+      (thread-wait thd)))
+  ;; start a new thread so we get a unique mailbox
+  (thread-wait (thread jobserver-thread)))
 
+;; ---------------------------------------------------------------------------------------
+;; public static void main open paren string square brackets args
+
 (define (run-webcrawler username password)
   (printf-debug "the credentials are: ~s ~s\n" username password)
   (printf-debug "logging in...\n")
   (crawler-login username password)
-  (printf-debug "logged in\n")
+  (printf-debug "logged in. Starting crawl\n")
   (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
-  (define next-pages (crawler-iterate-one base-url))
-  (displayln (map url->string next-pages)))
+  (crawler-jobserver base-url))
 
 (module+ main
   (command-line
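
Note on the tasking logic above: the crawler's concurrency is built entirely on
Racket's per-thread mailboxes (thread-send / thread-receive) rather than channels or
locks. Below is a minimal, self-contained sketch of the same jobserver/worker loop
with the HTTP fetch stubbed out, so the coordination pattern can be run and studied
on its own. The names fake-fetch and NUM-WORKERS are illustrative stand-ins, not
identifiers from this project.

#lang racket

;; Stand-in for crawler-iterate-one: "page" n links to 2n and 2n+1, up to a bound.
(define (fake-fetch n)
  (if (< n 50)
      (list (* 2 n) (add1 (* 2 n)))
      '()))

(define NUM-WORKERS 4)

;; Worker: block on the mailbox, report results to the server, repeat until #f.
(define (worker id server)
  (let loop ()
    (define job (thread-receive))
    (when job                               ; #f is the shutdown message
      (thread-send server (list id job (fake-fetch job)))
      (loop))))

;; Jobserver: hand pending jobs to idle workers; block for a result otherwise.
(define (jobserver root)
  (define me (current-thread))
  (define workers
    (for/vector ([i (in-range NUM-WORKERS)])
      (thread (lambda () (worker i me)))))
  (define ready (make-vector NUM-WORKERS #t))
  (define pending (mutable-set root))
  (define completed (mutable-set))
  (let loop ()
    (define any-busy (for/or ([r (in-vector ready)]) (not r)))
    (unless (and (not any-busy) (set-empty? pending))
      (define free
        (for/first ([w (in-naturals)] [r (in-vector ready)] #:when r) w))
      ;; dispatch one job if a worker is idle and work is queued
      (when (and free (not (set-empty? pending)))
        (define job (set-first pending))
        (set-remove! pending job)
        (vector-set! ready free #f)
        (thread-send (vector-ref workers free) job))
      ;; otherwise wait for some worker to report back
      (when (or (set-empty? pending) (not free))
        (match-define (list id job links) (thread-receive))
        (vector-set! ready id #t)
        (set-add! completed job)
        (for ([l (in-list links)])
          (unless (set-member? completed l)
            (set-add! pending l))))
      (loop)))
  ;; all work drained and all workers idle: shut everything down
  (for ([w (in-vector workers)])
    (thread-send w #f)
    (thread-wait w))
  (set-count completed))

(module+ main
  (printf "visited ~a pages\n" (jobserver 1)))

As in the patch, #f doubles as the shutdown message, and the server only blocks on
thread-receive when it cannot dispatch. One nuance the sketch shares with the patch:
new links are de-duplicated only against completed, so a URL that is currently in
flight can be queued a second time; that costs a redundant fetch but does not affect
termination. The patch additionally runs its jobserver on a freshly spawned thread so
that the mailbox it reads from is not shared with the main thread.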