Move util code to util.rkt
This commit is contained in:
parent
76207c5326
commit
b8f0cc5179
|
@ -0,0 +1,101 @@
|
|||
#lang racket
|
||||
|
||||
(require xml)
|
||||
|
||||
(provide debug-mode? printf-debug print-progress print-complete
|
||||
current-cookie-jar update-cookie-jar! cookie-jar->header
|
||||
find-flags find-hrefs)
|
||||
|
||||
;; scheme is bad and it should feel bad
|
||||
(define member? member)
|
||||
|
||||
;; Debug prints
|
||||
|
||||
;; Parameter for debug mode
|
||||
(define debug-mode? (make-parameter #f))
|
||||
|
||||
;; ... ->
|
||||
;; Prints the arguments as with printf, but only when debug mode is on
|
||||
(define (printf-debug . args)
|
||||
(when (debug-mode?) (apply printf args)))
|
||||
|
||||
;; Int Int Int Int ->
|
||||
;; Prints progress to the console, only when debug mode is on
|
||||
(define (print-progress total-pages in-flight crawled-pages num-flags)
|
||||
(when (debug-mode?)
|
||||
(printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a"
|
||||
crawled-pages total-pages in-flight num-flags)
|
||||
(flush-output (current-output-port))))
|
||||
|
||||
;; ->
|
||||
;; Prints a completion message to the console, only when debug mode is on
|
||||
(define (print-complete)
|
||||
(when (debug-mode?)
|
||||
(printf "\r\x1b[KCrawl complete\n")))
|
||||
|
||||
|
||||
;; Cookie jar
|
||||
(struct cj [jar cache sema] #:transparent)
|
||||
|
||||
;; -> CookieJar
|
||||
;; Creates empty cookie jar
|
||||
(define (make-cookie-jar)
|
||||
(cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
|
||||
|
||||
;; Parameter for current cookie jar
|
||||
(define current-cookie-jar (make-parameter (make-cookie-jar)))
|
||||
|
||||
;; CookieJar [List-of (cons Symbol String)] ->
|
||||
;; For a response with the given headers, finds any set-cookie headers and updates the jar
|
||||
(define (update-cookie-jar! jar hdrs)
|
||||
(call-with-semaphore
|
||||
(cj-sema jar)
|
||||
(lambda ()
|
||||
(for ([hdr (in-list hdrs)])
|
||||
(when (symbol=? 'set-cookie (car hdr))
|
||||
(define kv (string-trim (first (string-split (cdr hdr) ";"))))
|
||||
(match-define (list k v) (string-split kv "="))
|
||||
(hash-set! (cj-jar jar) k v)))
|
||||
(set-box!
|
||||
(cj-cache jar)
|
||||
(cons 'cookie
|
||||
(string-join
|
||||
(for/list ([(k v) (in-hash jar)])
|
||||
(format "~a=~a" k v)) "; "))))))
|
||||
|
||||
;; CookieJar -> (cons Symbol String)
|
||||
;; Converts this cookie jar into a cookie header for requests
|
||||
(define (cookie-jar->header jar)
|
||||
(call-with-semaphore (cj-sema jar) (curry unbox (cj-cache jar))))
|
||||
|
||||
;; HTML (XML) handling
|
||||
;; String -> xexpr
|
||||
;; Parses the given string as xml
|
||||
(define (string->xexpr str)
|
||||
(xml->xexpr (document-element (read-xml (open-input-string str)))))
|
||||
|
||||
;; xexpr -> [List-of String]
|
||||
;; Finds all flags in the given xexpr
|
||||
(define (find-flags xexpr)
|
||||
(match xexpr
|
||||
[(list _ (? (curry member? '(class "secret_flag"))) str)
|
||||
(list str)]
|
||||
[(list tag params rst ...)
|
||||
(foldl (lambda (x r) (append r (find-flags x))) '() rst)]
|
||||
[_ '()]))
|
||||
|
||||
;; xexpr -> [List-of String]
|
||||
;; Finds all <a> tag hrefs in the given xexpr
|
||||
(define (find-hrefs xexpr)
|
||||
(define (sub-find rst)
|
||||
(foldl (lambda (x r) (append r (find-hrefs x))) '() rst))
|
||||
(match xexpr
|
||||
[(list 'a params rst ...)
|
||||
(append
|
||||
(for/list ([param (in-list params)]
|
||||
#:when (symbol=? 'href (first param)))
|
||||
(second param))
|
||||
(sub-find rst))]
|
||||
[(list tag params rst ...)
|
||||
(sub-find rst)]
|
||||
[_ '()]))
|
67
webcrawler
67
webcrawler
|
@ -1,70 +1,21 @@
|
|||
#!/usr/bin/env racket
|
||||
#lang racket
|
||||
; vim: ft=racket
|
||||
(require xml)
|
||||
(require smol-http)
|
||||
|
||||
;; scheme is bad and it should feel bad
|
||||
(define member? member)
|
||||
; __ __ __
|
||||
; __/ // /_/ /___ _____ ____ _ __ ___ ____ __
|
||||
; /_ _ __/ / __ `/ __ \/ __ `/ / / / / | /| / / / / /
|
||||
; /_ _ __/ / /_/ / / / / /_/ / / /_/ /| |/ |/ / /_/ /
|
||||
; /_//_/ /_/\__,_/_/ /_/\__, / \__,_/ |__/|__/\__,_/
|
||||
; /____/
|
||||
|
||||
(require smol-http)
|
||||
(require "private/util.rkt")
|
||||
|
||||
(define DEFAULT-HDRS '((user-agent . "🦈 hash-lang-uwu crawler v1.0")
|
||||
;; christo pls do not track thanks
|
||||
(dnt . "1")))
|
||||
|
||||
(struct cj [jar cache sema] #:transparent)
|
||||
(define (make-cookie-jar)
|
||||
(cj (make-hash) (box (cons 'cookie "")) (make-semaphore 1)))
|
||||
(define (update-cookie-jar! jar hdrs)
|
||||
(call-with-semaphore
|
||||
(cj-sema jar)
|
||||
(lambda ()
|
||||
(for ([hdr (in-list hdrs)])
|
||||
(when (symbol=? 'set-cookie (car hdr))
|
||||
(define kv (string-trim (first (string-split (cdr hdr) ";"))))
|
||||
(match-define (list k v) (string-split kv "="))
|
||||
(hash-set! (cj-jar jar) k v)))
|
||||
(set-box!
|
||||
(cj-cache jar)
|
||||
(cons 'cookie
|
||||
(string-join
|
||||
(for/list ([(k v) (in-hash jar)])
|
||||
(format "~a=~a" k v)) "; "))))))
|
||||
(define (cookie-jar->header jar)
|
||||
(call-with-semaphore (cj-sema jar) (curry unbox (cj-cache jar))))
|
||||
|
||||
(define test-data
|
||||
(xml->xexpr
|
||||
(document-element
|
||||
(read-xml (open-input-string
|
||||
"<doc><bold>hi<h2 class='secret_flag' style='color:red'>FLAG: 64-characters-of-random-alphanumerics</h2></bold> there!</doc>")))))
|
||||
|
||||
(define (find-flag xexpr)
|
||||
(match xexpr
|
||||
[(list _ (? (curry member? '(class "secret_flag"))) str)
|
||||
str]
|
||||
[(list tag params rst ...)
|
||||
(ormap find-flag rst)]
|
||||
[_ #f]))
|
||||
|
||||
(find-flag test-data)
|
||||
|
||||
|
||||
(define debug-mode? (make-parameter #f))
|
||||
(define current-cookie-jar (make-parameter (make-cookie-jar)))
|
||||
|
||||
(define (printf-debug . args)
|
||||
(when (debug-mode?) (apply printf args)))
|
||||
|
||||
(define (print-progress total-pages in-flight crawled-pages num-flags)
|
||||
(when (debug-mode?)
|
||||
(printf "\r\x1b[KStatus: ~a/~a (~a in flight) | Flags: ~a"
|
||||
crawled-pages total-pages in-flight num-flags)
|
||||
(flush-output (current-output-port))))
|
||||
|
||||
(define (print-complete)
|
||||
(when (debug-mode?)
|
||||
(printf "\r\x1b[KCrawl complete\n")))
|
||||
|
||||
(define req
|
||||
(make-http-req 'GET "/hello-world" DEFAULT-HDRS))
|
||||
|
||||
|
|
Loading…
Reference in New Issue