#!/usr/bin/env racket
#lang racket
; vim: ft=racket

; __ __ __
; __/ // /_/ /___ _____ ____ _ __ ___ ____ __
; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
; /____/

(require net/url-string
         smol-http
         "private/util.rkt")

(define HOST "fring.ccs.neu.edu")
(define ROOT-PATH "/fakebook/")
(define LOGIN-PATH "/accounts/login/")

(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
                       ;; christo pls do not track thanks
                       (dnt . "1")))

;; ---------------------------------------------------------------------------------------
;; HTTP fetch logic

;; Returns response for request without closing the socket
(define (crawler-fetch/noclose method path [body #f])
  ;; Create the request based on the given parameters
  (define req
    (let ([basic-req (make-http-req method path '())])
      (if body (http-set-body basic-req body) basic-req)))
  ;; Send the request over a fresh connection, including any cookies from the
  ;; cookie jar in the headers
  (define sock
    (http-connect
     HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
                          DEFAULT-HDRS)))
  (define rsp (http-request sock req))
  ;; If we got any new cookies (yum!!) add them to the jar
  (update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
  ;; Abstract over some response codes we can handle directly here
  (match (http-rsp-code rsp)
    [(or 301 302)
     ;; handle redirects transparently
     (define new-location (http-msg-header rsp 'location))
     (http-close sock)
     (crawler-fetch/noclose method new-location body)]
    [500
     ;; handle server failure retries transparently
     (crawler-fetch/noclose method path body)]
    [_
     ;; other stuff like 403/404 up to caller
     (values rsp sock)]))
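
;; Usage sketch (not run at load time): callers that need the raw socket bind
;; both return values and are responsible for closing the socket themselves:
;;   (define-values [rsp sock] (crawler-fetch/noclose 'GET ROOT-PATH))
;;   ... read from (http-msg-body rsp) ...
;;   (http-close sock)
;; crawler-fetch and crawler-fetch/xexpr below wrap exactly this pattern.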

;; Returns response for request, closing socket
(define (crawler-fetch . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (http-close sock)
  rsp)

;; Fetches request and tries to parse the result as xexpr
(define (crawler-fetch/xexpr . params)
  (define-values [rsp sock] (apply crawler-fetch/noclose params))
  (match (http-rsp-code rsp)
    ;; return an empty response for 403 and 404. the page won't be visited again
    ;; because it will have been added to the complete set
    [(or 403 404)
     (http-close sock)
     '()]
    ;; we shouldn't run into this one
    [400 (error "you screwed up, got error 400 :angery:" params)]
    ;; normal response yay!!
    [200
     (define xe
       (string->xexpr
        (bytes->string/utf-8
         (for/fold ([res #""])
                   ([chunk (in-http-body-chunks (http-msg-body rsp))])
           (bytes-append res chunk)))))
     (http-close sock)
     xe]
    [code (http-close sock) (error "unexpected response code" code)]))
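
;; A minimal sketch of the chunk-accumulation fold above, run over plain byte
;; strings instead of a live HTTP body (in-http-body-chunks needs a real
;; response, so this only exercises the fold itself).
(module+ test
  (require rackunit)
  (check-equal?
   (for/fold ([res #""]) ([chunk (in-list '(#"<h" #"tml" #"/>"))])
     (bytes-append res chunk))
   #"<html/>"))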

;; ---------------------------------------------------------------------------------------
;; Utilities

;; Logs in with the given username and password
(define (crawler-login username password)
  ;; GET the login page first so the cookie jar picks up a csrftoken
  (crawler-fetch 'GET LOGIN-PATH)
  (define form-body
    (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
            username password (cookie-jar-ref (current-cookie-jar) "csrftoken")))
  (crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
  (void))
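
;; Sanity check of the login form encoding with a made-up CSRF token; at run
;; time the real token comes out of the cookie jar.
(module+ test
  (require rackunit)
  (check-equal?
   (format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
           "alice" "hunter2" "tok123")
   "username=alice&password=hunter2&csrfmiddlewaretoken=tok123&next="))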

;; Checks if this is a URL we should crawl
(define (crawler-valid-url? page-url)
  (match page-url
    [(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
    [_ #f]))
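
;; crawler-valid-url? should accept only plain-http URLs on HOST; anything
;; else is out of scope for the crawl.
(module+ test
  (require rackunit)
  (check-true  (crawler-valid-url? (string->url (string-append "http://" HOST ROOT-PATH))))
  (check-false (crawler-valid-url? (string->url (string-append "https://" HOST ROOT-PATH))))
  (check-false (crawler-valid-url? (string->url "http://example.com/fakebook/"))))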

;; ---------------------------------------------------------------------------------------
;; Main crawler iterator

;; Fetches one page and returns two values: a list of URLs to continue with
;; (may contain duplicates) and any flags found on the page
(define (crawler-iterate-one page-url)
  (let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
         [xe (crawler-fetch/xexpr 'GET path)]
         [flags (find-flags xe)]
         [page-links (map (lambda (x) (combine-url/relative page-url x))
                          (find-hrefs xe))])
    ;; only return URLs that are OK to crawl
    (values (filter crawler-valid-url? page-links) flags)))
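
;; The path reconstruction above turns a parsed URL back into a request path;
;; a quick check with a representative fakebook-style URL.
(module+ test
  (require rackunit)
  (check-equal?
   (let ([page-url (string->url (string-append "http://" HOST "/fakebook/123/"))])
     (format "/~a" (string-join (map path/param-path (url-path page-url)) "/")))
   "/fakebook/123/"))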

;; ---------------------------------------------------------------------------------------
;; Tasking logic

(define NUM-TASKS 256)

;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
(define (crawler-worker-thread worker-id jobserver)
  (let loop ()
    (define next-job (thread-receive))
    ;; jobserver sends a #f when it's time to exit
    (when next-job
      (define-values (new-urls flags) (crawler-iterate-one next-job))
      (thread-send jobserver (list worker-id next-job new-urls flags))
      (loop))))
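
;; Message protocol sketch: the jobserver thread-sends each worker a URL to
;; crawl (or #f to shut down); the worker replies with
;;   (list worker-id job-url new-urls flags)
;; so the jobserver can mark it ready again and fold the results into its sets.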

;; Starts up all the threads and schedules jobs until crawling is complete
(define (crawler-jobserver root-url)
  ;; Inner function to be run in its own thread
  (define (jobserver-thread)
    ;; Create workers
    (define worker-threads
      (let ([parent (current-thread)])
        (for/vector ([i (in-range NUM-TASKS)])
          (thread (lambda () (crawler-worker-thread i parent))))))
    ;; Worker status: #t means the worker is ready for the next task,
    ;; #f means the worker is currently busy
    (define worker-ready (make-vector NUM-TASKS #t))
    ;; URLs we have yet to crawl
    (define pending (mutable-set root-url))
    ;; URLs we have crawled already
    (define completed (mutable-set))
    ;; All flags seen so far
    (define all-flags (mutable-set))
    ;; Main loop
    (let loop ()
      (define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
      ;; Loop until no workers are busy and no URLs are pending; then we are done
      (unless (and (not any-busy) (set-empty? pending))
        ;; Find any available worker to give the next job to
        (define available-worker
          (for/first ([w (in-naturals)]
                      [r (in-vector worker-ready)]
                      #:when r)
            w))
        ;; If there is a worker and a job, assign the job
        (when (and (not (set-empty? pending)) available-worker)
          (define next-job (set-first pending))
          (set-remove! pending next-job)
          (thread-send (vector-ref worker-threads available-worker) next-job)
          (vector-set! worker-ready available-worker #f)
          (set! any-busy #t))

        ;; If there are no more jobs right now or no available workers, wait for
        ;; one of the workers to complete its job
        (when (or (set-empty? pending) (false? available-worker))
          (match-define (list worker-id job-url new-urls flags) (thread-receive))
          (vector-set! worker-ready worker-id #t)
          ;; Update our state with the results
          (set-add! completed job-url)
          (for ([u (in-list new-urls)])
            (unless (set-member? completed u)
              (set-add! pending u)))
          (for ([flag (in-list flags)] #:when (not (set-member? all-flags flag)))
            (print-flag flag)
            (set-add! all-flags flag)))

        ;; Useful debug status is printed in debug mode
        (print-progress (+ (set-count completed) (set-count pending))
                        (for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
                        (set-count completed) (set-count all-flags))

        (loop)))
    (print-complete (set-count completed) (set-count all-flags))
    ;; send all workers the shutdown message and wait
    (for ([thd (in-vector worker-threads)])
      (thread-send thd #f)
      (thread-wait thd)))
  ;; start a new thread so we get a unique mailbox
  (thread-wait (thread jobserver-thread)))
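
;; Minimal sketch of the mailbox pattern the jobserver relies on: every thread
;; has its own mailbox, so running the scheduler in a fresh thread gives it a
;; private queue that only its workers post to.
(module+ test
  (require rackunit)
  (define parent (current-thread))
  (define echo (thread (lambda () (thread-send parent (list 'done (thread-receive))))))
  (thread-send echo 'job)
  (check-equal? (thread-receive) '(done job)))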

;; ---------------------------------------------------------------------------------------
;; public static void main open paren string square brackets args

(define (run-webcrawler username password)
  (printf-debug "the credentials are: ~s ~s\n" username password)
  (printf-debug "logging in...\n")
  (crawler-login username password)
  (printf-debug "logged in. Starting crawl\n")
  (define base-url (url "http" #f HOST #f #t (list (path/param ROOT-PATH '())) '() #f))
  (crawler-jobserver base-url))

;; Parse command line arguments and run
(module+ main
  (command-line
   #:program "webcrawler"
   #:once-each
   [("-d") "Debug mode" (debug-mode? #t)]
   #:args (username password)
   (run-webcrawler username password)))