From b8f0cc5179a82f9a69a7046d318ce595cca9bee1 Mon Sep 17 00:00:00 2001 From: haskal Date: Sat, 11 Apr 2020 00:32:34 -0400 Subject: [PATCH] Move util code to util.rkt --- private/util.rkt | 101 +++++++++++++++++++++++++++++++++++++++++++++++ webcrawler | 67 +++++-------------------------- 2 files changed, 110 insertions(+), 58 deletions(-) create mode 100644 private/util.rkt diff --git a/private/util.rkt b/private/util.rkt new file mode 100644 index 0000000..60167a5 --- /dev/null +++ b/private/util.rkt @@ -0,0 +1,101 @@ +#lang racket + +(require xml) + +(provide debug-mode? printf-debug print-progress print-complete + current-cookie-jar update-cookie-jar! cookie-jar->header + find-flags find-hrefs) + +;; scheme is bad and it should feel bad +(define member? member) + +;; Debug prints + +;; Parameter for debug mode +(define debug-mode? (make-parameter #f)) + +;; ... -> +;; Prints the arguments as with printf, but only when debug mode is on +(define (printf-debug . args) + (when (debug-mode?) (apply printf args))) + +;; Int Int Int Int -> +;; Prints progress to the console, only when debug mode is on +(define (print-progress total-pages in-flight crawled-pages num-flags) + (when (debug-mode?) + (printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a" + crawled-pages total-pages in-flight num-flags) + (flush-output (current-output-port)))) + +;; -> +;; Prints a completion message to the console, only when debug mode is on +(define (print-complete) + (when (debug-mode?) + (printf "\r\x1b[KCrawl complete\n"))) + + +;; Cookie jar +(struct cj [jar cache sema] #:transparent) + +;; -> CookieJar +;; Creates empty cookie jar +(define (make-cookie-jar) + (cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1))) + +;; Parameter for current cookie jar +(define current-cookie-jar (make-parameter (make-cookie-jar))) + +;; CookieJar [List-of (cons Symbol String)] -> +;; For a response with the given headers, finds any set-cookie headers and updates the jar +(define (update-cookie-jar! jar hdrs) + (call-with-semaphore + (cj-sema jar) + (lambda () + (for ([hdr (in-list hdrs)]) + (when (symbol=? 'set-cookie (car hdr)) + (define kv (string-trim (first (string-split (cdr hdr) ";")))) + (match-define (list k v) (string-split kv "=")) + (hash-set! (cj-jar jar) k v))) + (set-box! + (cj-cache jar) + (cons 'cookie + (string-join + (for/list ([(k v) (in-hash jar)]) + (format "~a=~a" k v)) "; ")))))) + +;; CookieJar -> (cons Symbol String) +;; Converts this cookie jar into a cookie header for requests +(define (cookie-jar->header jar) + (call-with-semaphore (cj-sema jar) (curry unbox (cj-cache jar)))) + +;; HTML (XML) handling +;; String -> xexpr +;; Parses the given string as xml +(define (string->xexpr str) + (xml->xexpr (document-element (read-xml (open-input-string str))))) + +;; xexpr -> [List-of String] +;; Finds all flags in the given xexpr +(define (find-flags xexpr) + (match xexpr + [(list _ (? (curry member? '(class "secret_flag"))) str) + (list str)] + [(list tag params rst ...) + (foldl (lambda (x r) (append r (find-flags x))) '() rst)] + [_ '()])) + +;; xexpr -> [List-of String] +;; Finds all tag hrefs in the given xexpr +(define (find-hrefs xexpr) + (define (sub-find rst) + (foldl (lambda (x r) (append r (find-hrefs x))) '() rst)) + (match xexpr + [(list 'a params rst ...) + (append + (for/list ([param (in-list params)] + #:when (symbol=? 'href (first param))) + (second param)) + (sub-find rst))] + [(list tag params rst ...) + (sub-find rst)] + [_ '()])) diff --git a/webcrawler b/webcrawler index 4407176..04df549 100755 --- a/webcrawler +++ b/webcrawler @@ -1,70 +1,21 @@ #!/usr/bin/env racket #lang racket ; vim: ft=racket -(require xml) -(require smol-http) -;; scheme is bad and it should feel bad -(define member? member) +; __ __ __ +; __/ // /_/ /___ _____ ____ _ __ ___ ____ __ +; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / / +; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ / +; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/ +; /____/ + +(require smol-http) +(require "private/util.rkt") (define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0") ;; christo pls do not track thanks (dnt . "1"))) -(struct cj [jar cache sema] #:transparent) -(define (make-cookie-jar) - (cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1))) -(define (update-cookie-jar! jar hdrs) - (call-with-semaphore - (cj-sema jar) - (lambda () - (for ([hdr (in-list hdrs)]) - (when (symbol=? 'set-cookie (car hdr)) - (define kv (string-trim (first (string-split (cdr hdr) ";")))) - (match-define (list k v) (string-split kv "=")) - (hash-set! (cj-jar jar) k v))) - (set-box! - (cj-cache jar) - (cons 'cookie - (string-join - (for/list ([(k v) (in-hash jar)]) - (format "~a=~a" k v)) "; ")))))) -(define (cookie-jar->header jar) - (call-with-semaphore (cj-sema jar) (curry unbox (cj-cache jar)))) - -(define test-data - (xml->xexpr - (document-element - (read-xml (open-input-string - "hi

FLAG: 64-characters-of-random-alphanumerics

there!
"))))) - -(define (find-flag xexpr) - (match xexpr - [(list _ (? (curry member? '(class "secret_flag"))) str) - str] - [(list tag params rst ...) - (ormap find-flag rst)] - [_ #f])) - -(find-flag test-data) - - -(define debug-mode? (make-parameter #f)) -(define current-cookie-jar (make-parameter (make-cookie-jar))) - -(define (printf-debug . args) - (when (debug-mode?) (apply printf args))) - -(define (print-progress total-pages in-flight crawled-pages num-flags) - (when (debug-mode?) - (printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a" - crawled-pages total-pages in-flight num-flags) - (flush-output (current-output-port)))) - -(define (print-complete) - (when (debug-mode?) - (printf "\r\x1b[KCrawl complete\n"))) - (define req (make-http-req 'GET "/hello-world" DEFAULT-HDRS))