CS3700-project4/webcrawler

#!/usr/bin/env racket
#lang racket
; vim: ft=racket
; __ __ __
; __/ // /_/ /___ _____ ____ _ __ ___ ____ __
; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
; /____/
(require net/url-string
smol-http
"private/util.rkt")
(define HOST "fring.ccs.neu.edu")
(define ROOT-PATH "/fakebook/")
(define LOGIN-PATH "/accounts/login/")
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
;; christo pls do not track thanks
(dnt . "1")))
;; ---------------------------------------------------------------------------------------
;; HTTP fetch logic
;; Returns response for request without closing the socket
(define (crawler-fetch/noclose method path [body #f])
;; Create the request based on the given parameters
(define req
(let ([basic-req (make-http-req method path '())])
(if body (http-set-body basic-req body) basic-req)))
  ;; Open a connection, sending the default headers plus any cookies in the jar
(define sock
(http-connect
HOST #:headers (cons (cookie-jar->header (current-cookie-jar))
DEFAULT-HDRS)))
(define rsp (http-request sock req))
;; If we got any new cookies (yum!!) add them to the jar
(update-cookie-jar! (current-cookie-jar) (http-msg-headers rsp))
;; Abstract over some response codes we can handle directly here
(match (http-rsp-code rsp)
[(or 301 302)
     ;; handle redirects transparently (assumes Location is a path on this host)
(define new-location (http-msg-header rsp 'location))
(http-close sock)
(crawler-fetch/noclose method new-location body)]
    [500
     ;; handle server failure retries transparently (close the dead socket first
     ;; so retries don't leak connections)
     (http-close sock)
     (crawler-fetch/noclose method path body)]
[_
;; other stuff like 403/404 up to caller
(values rsp sock)]))
;; Returns response for request, closing socket
(define (crawler-fetch . params)
(define-values [rsp sock] (apply crawler-fetch/noclose params))
(http-close sock)
rsp)
;; Performs the request and parses the response body as an xexpr
(define (crawler-fetch/xexpr . params)
(define-values [rsp sock] (apply crawler-fetch/noclose params))
(match (http-rsp-code rsp)
    ;; return an empty response for 403 and 404. the page won't be visited again
    ;; because it will have been added to the completed set
[(or 403 404)
(http-close sock)
'()]
;; we shouldn't run into this one
[400 (error "you screwed up, got error 400 :angery:" params)]
;; normal response yay!!
[200
(define xe
(string->xexpr
(bytes->string/utf-8
(for/fold ([res #""])
([chunk (in-http-body-chunks (http-msg-body rsp))])
(bytes-append res chunk)))))
(http-close sock)
xe]
[code (http-close sock) (error "unexpected response code" code)]))
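;; Illustrative usage (hypothetical path; result shapes follow the cases above):
;;   (crawler-fetch/xexpr 'GET "/fakebook/")
;;   ;; => an xexpr like '(html () ...) on a 200, or '() on a 403/404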
;; ---------------------------------------------------------------------------------------
;; Utilities
;; Logs in with the given username and password
(define (crawler-login username password)
(crawler-fetch 'GET LOGIN-PATH)
(define form-body
(format "username=~a&password=~a&csrfmiddlewaretoken=~a&next="
username password (cookie-jar-ref (current-cookie-jar) "csrftoken")))
(crawler-fetch 'POST LOGIN-PATH (string->bytes/utf-8 form-body))
(void))
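;; For illustration, with hypothetical credentials and token, the POSTed body
;; would look like:
;;   username=alice&password=hunter2&csrfmiddlewaretoken=Tok123&next=
;; (the initial GET is what sets the csrftoken cookie we echo back here)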
;; Checks if this is a URL we should crawl
(define (crawler-valid-url? page-url)
(match page-url
[(url "http" _ (? (curry equal? HOST)) _ _ _ _ _) #t]
[_ #f]))
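;; For example (illustrative URLs):
;;   (crawler-valid-url? (string->url "http://fring.ccs.neu.edu/fakebook/10/")) ; => #t
;;   (crawler-valid-url? (string->url "http://example.com/fakebook/"))          ; => #f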
;; ---------------------------------------------------------------------------------------
;; Main crawler iterator
;; Fetches one page and returns two values: a list of urls to continue with
;; (may contain duplicates) and any flags found on the page
(define (crawler-iterate-one page-url)
(let* ([path (format "/~a" (string-join (map path/param-path (url-path page-url)) "/"))]
[xe (crawler-fetch/xexpr 'GET path)]
[flags (find-flags xe)]
[page-links (map (lambda (x) (combine-url/relative page-url x))
(find-hrefs xe))])
;; only return URLs that are OK to crawl
(values (filter crawler-valid-url? page-links) flags)))
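;; Illustrative call (the root page, bound with define-values to match the
;; two returned values):
;;   (define-values [next-urls flags]
;;     (crawler-iterate-one (string->url "http://fring.ccs.neu.edu/fakebook/")))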
;; ---------------------------------------------------------------------------------------
;; Tasking logic
(define NUM-TASKS 256)
;; Worker thread: receives tasks, does the HTTP, returns yummy links and flags
(define (crawler-worker-thread worker-id jobserver)
(let loop ()
(define next-job (thread-receive))
;; jobserver sends a #f when it's time to exit
(when next-job
(define-values (new-urls flags) (crawler-iterate-one next-job))
(thread-send jobserver (list worker-id next-job new-urls flags))
(loop))))
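;; Mailbox protocol, as implemented above and in the jobserver below:
;;   jobserver -> worker: a url to crawl, or #f to shut down
;;   worker -> jobserver: (list worker-id job-url new-urls flags)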
;; Starts up all the threads and schedules jobs until crawling is complete
(define (crawler-jobserver root-url)
;; Inner function to be run in its own thread
(define (jobserver-thread)
;; Create workers
(define worker-threads
(let ([parent (current-thread)])
(for/vector ([i (in-range NUM-TASKS)])
(thread (lambda () (crawler-worker-thread i parent))))))
;; Create worker status - #t means the worker is ready for the next task
;; #f means the worker is currently busy
(define worker-ready (make-vector NUM-TASKS #t))
;; URLs we have yet to crawl
(define pending (mutable-set root-url))
;; URLs we have crawled already
(define completed (mutable-set))
;; All flags seen so far
(define all-flags (mutable-set))
;; Main loop
(let loop ()
(define any-busy (for/or ([r (in-vector worker-ready)]) (not r)))
      ;; Keep looping until no workers are busy and no URLs are pending;
      ;; at that point the crawl is done
(unless (and (not any-busy) (set-empty? pending))
;; Find any available worker to give the next job to
(define available-worker
(for/first ([w (in-naturals)]
[r (in-vector worker-ready)]
#:when r)
w))
;; If there is a worker and a job, assign the job
(when (and (not (set-empty? pending)) available-worker)
(define next-job (set-first pending))
(set-remove! pending next-job)
(thread-send (vector-ref worker-threads available-worker) next-job)
(vector-set! worker-ready available-worker #f)
(set! any-busy #t))
;; If there are no more jobs right now or there are no available workers, wait for one of
;; the workers to complete its job
(when (or (set-empty? pending) (false? available-worker))
(match-define (list worker-id job-url new-urls flags) (thread-receive))
(vector-set! worker-ready worker-id #t)
;; Update our state with the results
(set-add! completed job-url)
(for ([u (in-list new-urls)])
(unless (set-member? completed u)
(set-add! pending u)))
(for ([flag (in-list flags)] #:when (not (set-member? all-flags flag)))
(print-flag flag)
(set-add! all-flags flag)))
      ;; In debug mode, print a progress status line
(print-progress (+ (set-count completed) (set-count pending))
(for/sum ([v (in-vector worker-ready)] #:when (not v)) 1)
(set-count completed) (set-count all-flags))
(loop)))
(print-complete (set-count completed) (set-count all-flags))
;; send all workers the shutdown message and wait
(for ([thd (in-vector worker-threads)])
(thread-send thd #f)
(thread-wait thd)))
  ;; Run the jobserver in its own thread so it gets a dedicated mailbox:
  ;; thread-receive reads the current thread's mailbox, and this keeps worker
  ;; replies separate from any messages sent to the main thread
(thread-wait (thread jobserver-thread)))
;; ---------------------------------------------------------------------------------------
;; public static void main open paren string square brackets args
(define (run-webcrawler username password)
(printf-debug "the credentials are: ~s ~s\n" username password)
(printf-debug "logging in...\n")
(crawler-login username password)
(printf-debug "logged in. Starting crawl\n")
  ;; Build the root URL with string->url so ROOT-PATH becomes proper path
  ;; segments; stuffing "/fakebook/" into a single path/param would make
  ;; crawler-iterate-one reconstruct the path as "//fakebook/"
  (define base-url (string->url (format "http://~a~a" HOST ROOT-PATH)))
(crawler-jobserver base-url))
;; Parse command line arguments and run
(module+ main
(command-line
#:program "webcrawler"
#:once-each
[("-d") "Debug mode" (debug-mode? #t)]
#:args
(username password)
(run-webcrawler username password)))