From 1a66ff29f383d85a93bf3b95ab8ba80f3321571c Mon Sep 17 00:00:00 2001 From: haskal Date: Sun, 10 Jan 2021 01:05:23 -0500 Subject: [PATCH] if too many errors on a task, stop trying --- README.md | 10 +++++----- crossfire/server.rkt | 26 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 7cefd8c..958db61 100644 --- a/README.md +++ b/README.md @@ -148,11 +148,11 @@ contributions welcome,,,, ## status ### base -- 🚧 input space manipulation functions +- ✅ input space manipulation functions - ✅ data types: using data/integer-set, pattern (vector of integer-set) - ✅ basic manipulation functions - ✅ representation of input space as a flat integer -- 🚧 #lang for configuration/definitions +- ✅ #lang for configuration/definitions - (input) mode - stdio: user program gets input by stdio, integers separated by space, one per line - callback: input generator compiled into user program, user main calls `crossfire_main` @@ -161,8 +161,8 @@ contributions welcome,,,, - SMP: performed by crossfire or performed by the user code - "performed by user code" can also mean GPU, for example - 🚧 codegen for input generator (in C) - - ✅ stdio mode - - 🚧 callback mode + - 🚧 stdio mode + - ✅ callback mode - ✅ success reporting mechanism - low priority: configurable "character" type -- currently a "character" is a uint64\_t @@ -172,7 +172,7 @@ contributions welcome,,,, - low priority: randomized input space distribution - low priority: store common configuration templates for clients - low priority: track upload/download progress -- streaming interface for file transfers +- ✅ streaming interface for file transfers - ✅ accept submitted projects (with client-compiled input generator) and distribute to agents - ✅ low priority: support for multiple architectures - ✅ agent authentication diff --git a/crossfire/server.rkt b/crossfire/server.rkt index a82be89..abb725f 100644 --- a/crossfire/server.rkt +++ b/crossfire/server.rkt @@ 
-407,6 +407,11 @@ (define *max-retry-delay* 120) (define *retry-delay-ratio* 2) +;; every time an agent reports an error on a task, we increment that agent's error count for the task +;; every time there is success, we decrement that error count by one (never going below 0) +;; threshold for the error count before we stop assigning the task to that agent +(define *max-task-errors* 10) + (define (agent-handler) ;; unlike comms, messages to agent-handler have no responses. just thread-send, it's one-way (define cust (make-custodian)) @@ -427,7 +432,7 @@ ;; precisely than just the taskid, which allows them to cache the file locally ;; completed-work: an integer set of completed work (struct task-state [id sema manifest [work-pattern #:mutable] agent-todo file-hash - [completed-work #:mutable]] #:transparent) + [completed-work #:mutable] error-log] #:transparent) (define (initialize-task id mf) (define file-hash (server-hash-file id)) @@ -441,7 +446,8 @@ ([(_1 _2 _3 pat-fasl) (in-query (current-db) q-get-task-log id)]) (define sub (make-integer-set (fasl->s-exp pat-fasl))) (values (integer-set-subtract pattern-range sub) (integer-set-union completed-work sub)))) - (task-state id sema mf pattern-range agent-todo file-hash completed-work)) + (define error-log (make-hash)) + (task-state id sema mf pattern-range agent-todo file-hash completed-work error-log)) (define (task-has-work? ts) (not (and (zero? (integer-set-count (task-state-work-pattern ts))) @@ -498,8 +504,10 @@ assignment])))) ;; returns work from agent back to the regular work pool - (define (task-unassign! ts agent-id) + (define (task-unassign! ts agent-id [error? #f]) (call-with-semaphore (task-state-sema ts) (lambda () + (when error? + (hash-update! (task-state-error-log ts) agent-id add1 0)) (match (hash-ref (task-state-agent-todo ts) agent-id #f) [#f (void)] [assignment @@ -511,6 +519,7 @@ ;; adds to task log, then updates work pool with task completion (define (task-complete! 
ts agent-id time-wall-start duration) (call-with-semaphore (task-state-sema ts) (lambda () + (hash-update! (task-state-error-log ts) agent-id (lambda (x) (max 0 (sub1 x))) 0) (match (hash-ref (task-state-agent-todo ts) agent-id #f) [#f (void)] [assignment @@ -619,12 +628,12 @@ (integer-set-contents assign-data))) #t])) - (define (cancel-assignment! assignment) + (define (cancel-assignment! assignment [error? #f]) ;; tell the agent to cancel work, unassign the assignment (define ts (hash-ref current-tasks (assignment-taskid assignment) #f)) ;; update manager tracking (unless (false? ts) - (task-unassign! ts id)) + (task-unassign! ts id error?)) (hash-remove! assigned-tasks (assignment-id assignment)) ;; send agent rpc (with-handlers ([exn:fail? @@ -705,10 +714,12 @@ ['() #f] [(cons head tail) (define manifest (task-state-manifest head)) + (define error-count (hash-ref (task-state-error-log head) id 0)) (define needed-resources (list->set (manifest-data-ref manifest 'resources '()))) (define needed-arch (manifest-data-ref manifest 'arch '("any"))) (define right-arch? (or (member "any" needed-arch) (member arch needed-arch))) (if (and right-arch? + (< error-count *max-task-errors*) (task-has-work? head) (subset? needed-resources available-resources)) (create-assignment! head) @@ -747,11 +758,12 @@ (unless (false? av) (complete-assignment! av)) (update-assignments!)] ;; execution failed, unassign task - ;; TODO : maybe blacklist this task from this agent if there are too many errors ;; TODO : notify connected clients that an error occurred [(cons assignment-id 'error) (define av (hash-ref assigned-tasks assignment-id #f)) - (unless (false? av) (cancel-assignment! av)) + (unless (false? av) + ;; #t for yes, this is an error + (cancel-assignment! av #t)) (update-assignments!)] ;; got succeeding input [(cons assignment-id success-input)