{ pkgs, lib, stdlib, jqlib, pythonlib, # TODO python env... processlib, }: with lib; mkModule (self: { # A class is basically an identifier namespace. class1 = mkClass {}; # Another class of entities. May have overlapping identifiers with class1 but there is no correlation class2 = mkClass {}; # A repository is an attribute of an entity of a given class. It is the core data storage abstraction. repo1 = mkRepo self.class1 stdlib.dtypes.blob {}; repo2 = mkRepo self.class1 stdlib.dtypes.json { schema = with stdlib.dtypes.json.schema; dictKeysComplete { related = int; subkey = dictOf str (listOf str); }; }; # A repository can be declared on each class repo3 = mkRepo self.class2 stdlib.dtypes.json {}; repo4 = mkRepo self.class2 stdlib.dtypes.blob {}; # A view is a derived repository. There are many ways to describe a transformation between inputs and outputs. # Here we use jq to query over an input repository. We also cast it to a foreign key, which is the id of an entity of a given class. relation = mkView self.class1 jqlib.expr { inputs.input.repo = self.repo2; query = "$input.related"; cast = stdlib.dtypes.foreignKeyOf self.class2; }; # Here, we use the previous foreign key repository to create a new repository on class1 which contains data from repo3 (declared on class2) # by specifying that this repository should be accessed through a foreign entity. repoRelated = mkRelated self.class1 self.relation self.repo3 {}; # The declaration of a view is a shorthand for declaring a function and binding it to some input and output repositories. 
# Here we do it in long form func1 = mkFunc pythonlib.func { inputs.one.dtype = stdlib.dtypes.json; inputs.two.dtype = stdlib.dtypes.blob; outputs.return.dtype = stdlib.dtypes.json; module = pkgs.writeText "func1.py" '' def func1(one, two): return { "one": one["yay"], "two": two.read(), } ''; function = "func1"; }; bind1 = mkBinding self.class1 { func = self.func1; inputs.one = self.repo2; inputs.two = self.repo1; outputs.return = self.repoRelated; }; streamFunc = mkFunc processlib.processFunc { inputs.stdin = { dtype = stdlib.dtypes.json; format = processlib.formats.streamOf processlib.formats.yaml; }; outputs.out = { type = stdlib.dtypes.seqOf stdlib.dtypes.blob; format = processlib.formats.filepathOf (processlib.formats.watchdirOf processlib.formats.file); }; executable = pkgs.writeShellScript "streamFunc.sh" '' id=0 grep whatever | while read -r line; do md5sum >$out/$id <<<$line id=$((id + 1)) done ''; }; tupleFunc = mkFunc processlib.processFunc { inputs.stdin = { dtype = stdlib.dtypes.json; format = processlib.formats.streamOf processlib.formats.yaml; }; outputs.out = { type = stdlib.dtypes.seqOf (stdlib.dtypes.tupleOf { data = stdlib.dtypes.blob; metadata = stdlib.dtypes.json; }); format = processlib.formats.filepathOf (processlib.formats.watchdirOf (processlib.formats.tupleDirOf { data = processlib.formats.file; metadata = processlib.formats.yaml; })); }; executable = pkgs.writeShellScript "tupleFunc.sh" '' id=0 grep whatever | while read -r line; do mkdir -p $out/$id md5sum >$out/$id/data <<<$line cat >$out/$id/metadata <