From 49201484175e4e153dd0ddaa1aa6a722cc31916b Mon Sep 17 00:00:00 2001 From: Matthias Kretschmann Date: Wed, 21 Apr 2021 11:24:25 +0200 Subject: [PATCH] rewrite and structure --- .../tutorials/compute-to-data-algorithms.md | 225 ++++++++++++++++++ content/tutorials/write-c2d-algorithms.md | 216 ----------------- data/sidebars/tutorials.yml | 4 +- package-lock.json | 64 ++--- 4 files changed, 259 insertions(+), 250 deletions(-) create mode 100644 content/tutorials/compute-to-data-algorithms.md delete mode 100644 content/tutorials/write-c2d-algorithms.md diff --git a/content/tutorials/compute-to-data-algorithms.md b/content/tutorials/compute-to-data-algorithms.md new file mode 100644 index 00000000..c821b9bb --- /dev/null +++ b/content/tutorials/compute-to-data-algorithms.md @@ -0,0 +1,225 @@ +--- +title: Writing Algorithms for Compute to Data +description: Learn how to write algorithms for use in Ocean Protocol's Compute-to-Data feature. +--- + +## Overview + +An algorithm in the Ocean Protocol stack is another asset type, in addition to data sets. An algorithm for Compute to Data is composed of the following: + +- an algorithm code +- a Docker image (base image + tag) +- an entry point + +## Environment + +When creating an algorithm asset in Ocean Protocol, the additional `algorithm` object needs to be included in its metadata service to define the Docker container environment: + +```json +{ + "algorithm": { + "container": { + "entrypoint": "node $ALGO", + "image": "node", + "tag": "latest" + } + } +} +``` + +| Variable | Usage | +| ------------ | --------------------------------------------------------------------------------------------------------------------------------------- | +| `image` | The Docker image name the algorithm will run with. | +| `tag` | The Docker image tag that you are going to use. | +| `entrypoint` | The Docker entrypoint. `$ALGO` is a macro that gets replaced inside the compute job, depending where your algorithm code is downloaded. 
| + +When publishing an algorithm through the [Ocean Market](https://market.oceanprotocol.com), these properties can be set via the publish UI. + +### Environment Examples + +Run an algorithm written in JavaScript/Node.js, based on Node.js v14: + +```json +{ + "algorithm": { + "container": { + "entrypoint": "node $ALGO", + "image": "node", + "tag": "14" + } + } +} +``` + +Run an algorithm written in Python, based on Python v3.9: + +```json +{ + "algorithm": { + "container": { + "entrypoint": "python3.9 $ALGO", + "image": "python", + "tag": "3.9.4-alpine3.13" + } + } +} +``` + +Be aware that you might need a lot of dependencies, so it's a lot faster if you are going to build your own image and publish your algorithm with that custom image. We also collect some [example images](https://github.com/oceanprotocol/algo_dockers). + +### Data Storage + +As part of a compute job, every algorithm runs in a K8s pod with these volumes mounted: + +| Path | Permissions | Usage | +| --------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `/data/inputs` | read | Storage for input data sets, accessible only to the algorithm running in the pod. | +| `/data/ddos` | read | Storage for all DDOs involved in compute job (input data set + algorithm). | +| `/data/outputs` | read/write | Storage for all of the algorithm's output files. They are uploaded on some form of cloud storage, and URLs are sent back to the consumer. | +| `/data/logs/` | read/write | All algorithm output (such as `print`, `console.log`, etc.) is stored in a file located in this folder. They are stored and sent to the consumer as well.
| + +### Environment variables available to algorithms + +For every algorithm pod, the Compute to Data environment provides the following environment variables: + +| Variable | Usage | +| -------------------- | ------------------------------------------------------ | +| `DIDS` | An array of DID strings containing the input datasets. | +| `TRANSFORMATION_DID` | The DID of the algorithm. | + +## Example: JavaScript/Node.js + +The following is a simple JavaScript/Node.js algorithm, doing a line count for ALL input datasets. The algorithm is not using any environment variables, but instead it's scanning the `/data/inputs` folder. + +```js +const fs = require('fs') + +const inputFolder = '/data/inputs' +const outputFolder = '/data/outputs' + +async function countrows(file) { + console.log('Start counting for ' + file) + const fileBuffer = fs.readFileSync(file) + const toString = fileBuffer.toString() + const splitLines = toString.split('\n') + const rows = splitLines.length - 1 + fs.appendFileSync(outputFolder + '/output.log', file + ',' + rows + '\r\n') + console.log('Finished. 
We have ' + rows + ' lines') +} + +async function processfolder(folder) { + const files = fs.readdirSync(folder) + + for (let i = 0; i < files.length; i++) { + const file = files[i] + const fullpath = folder + '/' + file + if (fs.statSync(fullpath).isDirectory()) { + await processfolder(fullpath) + } else { + await countrows(fullpath) + } + } +} + +processfolder(inputFolder) +``` + +This snippet will create and expose the following files as compute job results to the consumer: + +- `/data/outputs/output.log` +- `/data/logs/algo.log` + +To run this, use the following container object: + +```json +{ + "algorithm": { + "container": { + "entrypoint": "node $ALGO", + "image": "node", + "tag": "12" + } + } +} +``` + +## Example: Python + +A more advanced line counting in Python, which relies on environment variables and constructs a job object, containing all the input files & DDOs + +```python +import pandas as pd +import numpy as np +import os +import time +import json + +def get_job_details(): + """Reads in metadata information about assets used by the algo""" + job = dict() + job['dids'] = json.loads(os.getenv('DIDS', None)) + job['metadata'] = dict() + job['files'] = dict() + job['algo'] = dict() + job['secret'] = os.getenv('secret', None) + algo_did = os.getenv('TRANSFORMATION_DID', None) + if job['dids'] is not None: + for did in job['dids']: + # get the ddo from disk + filename = '/data/ddos/' + did + print(f'Reading json from {filename}') + with open(filename) as json_file: + ddo = json.load(json_file) + # search for metadata service + for service in ddo['service']: + if service['type'] == 'metadata': + job['files'][did] = list() + index = 0 + for file in service['attributes']['main']['files']: + job['files'][did].append( + '/data/inputs/' + did + '/' + str(index)) + index = index + 1 + if algo_did is not None: + job['algo']['did'] = algo_did + job['algo']['ddo_path'] = '/data/ddos/' + algo_did + return job + + +def line_counter(job_details): + """Executes the
line counter based on inputs""" + print('Starting compute job with the following input information:') + print(json.dumps(job_details, sort_keys=True, indent=4)) + + """ Now, count the lines of the first file in first did """ + first_did = job_details['dids'][0] + filename = job_details['files'][first_did][0] + non_blank_count = 0 + with open(filename) as infp: + for line in infp: + if line.strip(): + non_blank_count += 1 + print ('number of non-blank lines found %d' % non_blank_count) + """ Print that number to output to generate algo output""" + f = open("/data/outputs/result", "w") + f.write(str(non_blank_count)) + f.close() + + +if __name__ == '__main__': + line_counter(get_job_details()) + +``` + +To run this algorithm, use the following `container` object: + +```json +{ + "algorithm": { + "container": { + "entrypoint": "python3.6 $ALGO", + "image": "oceanprotocol/algo_dockers", + "tag": "python-sql" + } + } +} +``` diff --git a/content/tutorials/write-c2d-algorithms.md b/content/tutorials/write-c2d-algorithms.md deleted file mode 100644 index 3339a6f9..00000000 --- a/content/tutorials/write-c2d-algorithms.md +++ /dev/null @@ -1,216 +0,0 @@ ---- -title: Writing C2D Algorithms -description: ---- - -## Overview - -An C2D algorithm is composed of the following: - - - a docker image (base image + tag) - - an algorithm code - - a entry point - -That's why, while creating the algorithm asset in ocean, we need the additional object "algorithm" defined in the metadata service: - -```json -"algorithm": { - "container": { - "entrypoint": "node $ALGO", - "image": "node", - "tag": "latest" - } - } -``` - -Most important attributes are the following: - - - image: this is the docker image that your are going to use - - tag: this is the docker image tag that you are going to use - - entrypoint: this is the entrypoint. 
$ALGO is a macro that gets replaced inside C2D, depending where your algo code is downloaded - -Here are some examples: - -- to run a JS algo, based on node 14: - - ```json - "algorithm": { - "container": { - "entrypoint": "node $ALGO", - "image": "node", - "tag": "14" - } - } - ``` - -- to run a python algo, based on python:3.9.4-alpine3.13: - - ```json - "algorithm": { - "container": { - "entrypoint": "python3.9 $ALGO", - "image": "python", - "tag": "3.9.4-alpine3.13" - } - } - ``` - -Be aware that you might need a lot of dependencies, so it's a lot faster if you are going to build your own images (we build some [HERE](https://github.com/oceanprotocol/algo_dockers)) - - - -## Data structure - -Every algorithm pod will have some volumes mounted: - - - /data/inputs (read only) - this is where the datasets are going to be stored - - /data/ddos (read only) - this is where all DDOs (input + algorithm) are going to be stored - - /data/outputs - this is where the algorithm should store all output files (they are going to be uploaded on storage and URLs sent back to the consumer) - -All algorithm output (such as print, console.log, etc) are going to be stored in a file located in /data/logs/. They are going to be stored and sent to the customer as well - - -## ENV variables available to algorithms - - -For every algorithm pod, C2D is going to provide the following ENVs: - - - DIDS: this is an array containing the input datasets - - TRANSFORMATION_DID: this is the algorithm did - - -## Sample Algorithms - -## JS example - -The following is a simple js algorithm, that does a line count for ALL input datasets. The algo is not using any ENVS, but instead it's scanning the /data/inputs folder. 
- -```js -const fs = require("fs") -const path = require("path") - - -var input_folder="/data/inputs"; -var output_folder="/data/outputs" - - -async function processfolder(Path) { - var files = fs.readdirSync(Path) - for (var i = 0; i < files.length; i++) { - var file=files[i]; - var fullpath=Path + "/" + file; - if (fs.statSync(fullpath).isDirectory()) { - await processfolder(fullpath) - } else { - await countrows(fullpath) - } - } -} - - -async function countrows(file){ - console.log("Start counting for "+file) - var fileBuffer = fs.readFileSync(file); - var to_string = fileBuffer.toString(); - var split_lines = to_string.split("\n"); - var rows=split_lines.length-1; - fs.appendFileSync(output_folder+'/output.log', file+','+rows+"\r\n"); - console.log('Finished. We have '+rows+' lines') -} - -processfolder(input_folder) -``` - -To run this, use the following container object: - -```json -"algorithm": { - "container": { - "entrypoint": "node $ALGO", - "image": "node", - "tag": "10" - } -} -``` - - - -## Advanced Python example - -A more advanced python line counting, which relies on ENVs and constructs a job object, containing all the input files & ddos - -```python -import pandas as pd -import numpy as np -import os -import time -import json - -def get_job_details(): - """Reads in metadata information about assets used by the algo""" - job = dict() - job['dids'] = json.loads(os.getenv('DIDS', None)) - job['metadata'] = dict() - job['files'] = dict() - job['algo'] = dict() - job['secret'] = os.getenv('secret', None) - algo_did = os.getenv('TRANSFORMATION_DID', None) - if job['dids'] is not None: - for did in job['dids']: - # get the ddo from disk - filename = '/data/ddos/' + did - print(f'Reading json from {filename}') - with open(filename) as json_file: - ddo = json.load(json_file) - # search for metadata service - for service in ddo['service']: - if service['type'] == 'metadata': - job['files'][did] = list() - index = 0 - for file in 
service['attributes']['main']['files']: - job['files'][did].append( - '/data/inputs/' + did + '/' + str(index)) - index = index + 1 - if algo_did is not None: - job['algo']['did'] = algo_did - job['algo']['ddo_path'] = '/data/ddos/' + algo_did - return job - - -def line_counter(job_details): - """Executes the line counter based on inputs""" - print('Starting compute job with the following input information:') - print(json.dumps(job_details, sort_keys=True, indent=4)) - - """ Now, count the lines of the first file in first did """ - first_did = job_details['dids'][0] - filename = job_details['files'][first_did][0] - non_blank_count = 0 - with open(filename) as infp: - for line in infp: - if line.strip(): - non_blank_count += 1 - print ('number of non-blank lines found %d' % non_blank_count) - """ Print that number to output to generate algo output""" - f = open("/data/outputs/result", "w") - f.write(str(non_blank_count)) - f.close() - - -if __name__ == '__main__': - line_counter(get_job_details()) - -``` - -To run this, use the following container object: - -```json -"algorithm": { - "container": { - "entrypoint": "python3.6 $ALGO", - "image": "oceanprotocol/algo_dockers", - "tag": "python-sql" - } -} -``` - diff --git a/data/sidebars/tutorials.yml b/data/sidebars/tutorials.yml index c8590f2c..c1ae57cd 100644 --- a/data/sidebars/tutorials.yml +++ b/data/sidebars/tutorials.yml @@ -24,10 +24,10 @@ - group: Compute-to-Data items: + - title: Writing Algorithms + link: /tutorials/compute-to-data-algorithms/ - title: Run a Compute-to-Data Environment link: /tutorials/compute-to-data/ - - title: Writing C2D Algorithms - link: /tutorials/write-c2d-algorithms/ - group: Storage Setup items: diff --git a/package-lock.json b/package-lock.json index 20a8ebb3..3bcd0db1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7248,9 +7248,9 @@ } }, "classnames": { - "version": "2.2.6", - "resolved": "https://registry.npmjs.org/classnames/-/classnames-2.2.6.tgz", - "integrity": 
"sha512-JR/iSQOSt+LQIWwrwEzJ9uk0xfN3mTVYMwt1Ir5mUcSN6pU+V4zQFFaJsclJbPuAUQH+yfWef6tm7l1quW3C8Q==" + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/classnames/-/classnames-2.3.1.tgz", + "integrity": "sha512-OlQdbZ7gLfGarSqxesMesDa5uz7KFbID8Kpq/SxIoNGDqY8lSYs0D+hhtBXhcdB3rcbXArFr7vlHheLk1voeNA==" }, "clean-stack": { "version": "2.2.0", @@ -9980,9 +9980,9 @@ } }, "eslint-config-prettier": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-8.1.0.tgz", - "integrity": "sha512-oKMhGv3ihGbCIimCAjqkdzx2Q+jthoqnXSP+d86M9tptwugycmTFdVR4IpLgq2c4SHifbwO90z2fQ8/Aio73yw==", + "version": "8.2.0", + "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-8.2.0.tgz", + "integrity": "sha512-dWV9EVeSo2qodOPi1iBYU/x6F6diHv8uujxbxr77xExs3zTAlNXvVZKiyLsQGNz7yPV2K49JY5WjPzNIuDc2Bw==", "dev": true }, "eslint-config-react-app": { @@ -10379,9 +10379,9 @@ } }, "eslint-plugin-prettier": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-3.3.1.tgz", - "integrity": "sha512-Rq3jkcFY8RYeQLgk2cCwuc0P7SEFwDravPhsJZOQ5N4YI4DSg50NyqJ/9gdZHzQlHf8MvafSesbNJCcP/FF6pQ==", + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-3.4.0.tgz", + "integrity": "sha512-UDK6rJT6INSfcOo545jiaOwB701uAIt2/dR7WnFQoGCVl1/EMqdANBmwUaqqQ45aXprsTGzSa39LI1PyuRBxxw==", "dev": true, "requires": { "prettier-linter-helpers": "^1.0.0" @@ -13675,9 +13675,9 @@ } }, "gatsby-remark-vscode": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/gatsby-remark-vscode/-/gatsby-remark-vscode-3.2.0.tgz", - "integrity": "sha512-KsI47oc5SfjZkcyNo+P6IZx3ouMWpKGuPXEDpnG70R9QOmwcrSGUJAmI37RfGP2RSTMnla+28sxzWOe1vXnL9g==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/gatsby-remark-vscode/-/gatsby-remark-vscode-3.2.1.tgz", + "integrity": 
"sha512-txzIOhfkBg49YLAw49L8PnkTu9ZK8gu61p/WbXelL0R9Abw96pmP+R4Bu1RJx3NSwikhC0nqwgORZl/qeaWwXQ==", "requires": { "decompress": "^4.2.0", "json5": "^2.1.1", @@ -15481,9 +15481,9 @@ "integrity": "sha512-SEQu7vl8KjNL2eoGBLF3+wAjpsNfA9XMlXAYj/3EdaNfAlxKthD1xjEQfGOUhllCGGJVNY34bRr6lPINhNjyZw==" }, "husky": { - "version": "5.1.3", - "resolved": "https://registry.npmjs.org/husky/-/husky-5.1.3.tgz", - "integrity": "sha512-fbNJ+Gz5wx2LIBtMweJNY1D7Uc8p1XERi5KNRMccwfQA+rXlxWNSdUxswo0gT8XqxywTIw7Ywm/F4v/O35RdMg==", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/husky/-/husky-6.0.0.tgz", + "integrity": "sha512-SQS2gDTB7tBN486QSoKPKQItZw97BMOd+Kdb6ghfpBc0yXyzrddI0oDV5MkDAbuB4X2mO3/nj60TRMcYxwzZeQ==", "dev": true }, "iconv-lite": { @@ -19636,13 +19636,13 @@ } }, "plist": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/plist/-/plist-3.0.1.tgz", - "integrity": "sha512-GpgvHHocGRyQm74b6FWEZZVRroHKE1I0/BTjAmySaohK+cUn+hZpbqXkc3KWgW3gQYkqcQej35FohcT0FRlkRQ==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/plist/-/plist-3.0.2.tgz", + "integrity": "sha512-MSrkwZBdQ6YapHy87/8hDU8MnIcyxBKjeF+McXnr5A9MtffPewTs7G3hlpodT5TacyfIyFTaJEhh3GGcmasTgQ==", "requires": { - "base64-js": "^1.2.3", + "base64-js": "^1.5.1", "xmlbuilder": "^9.0.7", - "xmldom": "0.1.x" + "xmldom": "^0.5.0" }, "dependencies": { "xmlbuilder": { @@ -22621,9 +22621,9 @@ } }, "slugify": { - "version": "1.4.7", - "resolved": "https://registry.npmjs.org/slugify/-/slugify-1.4.7.tgz", - "integrity": "sha512-tf+h5W1IrjNm/9rKKj0JU2MDMruiopx0jjVA5zCdBtcGjfp0+c5rHw/zADLC3IeKlGHtVbHtpfzvYA0OYT+HKg==" + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/slugify/-/slugify-1.5.0.tgz", + "integrity": "sha512-Q2UPZ2udzquy1ElHfOLILMBMqBEXkiD3wE75qtBvV+FsDdZZjUqPZ44vqLTejAVq+wLLHacOMcENnP8+ZbzmIA==" }, "smoothscroll-polyfill": { "version": "0.4.4", @@ -24873,14 +24873,14 @@ "integrity": 
"sha512-2ham8XPWTONajOR0ohOKOHXkm3+gaBmGut3SRuu75xLd/RRaY6vqgh8NBYYk7+RW3u5AtzPQZG8F10LHkl0lAQ==" }, "vscode-oniguruma": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.4.0.tgz", - "integrity": "sha512-VvTl/jIAADEqWWpEYRsOI1sXiYOTDA8KYNgK60+Mb3T+an9zPz3Cqc6RVJeYgOx/P5G+4M4jygB3X5xLLfYD0g==" + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.5.1.tgz", + "integrity": "sha512-JrBZH8DCC262TEYcYdeyZusiETu0Vli0xFgdRwNJjDcObcRjbmJP+IFcA3ScBwIXwgFHYKbAgfxtM/Cl+3Spjw==" }, "vscode-textmate": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.2.0.tgz", - "integrity": "sha512-Uw5ooOQxRASHgu6C7GVvUxisKXfSgW4oFlO+aa+PAkgmH89O3CXxEEzNRNtHSqtXFTl0nAC1uYj0GMSH27uwtQ==" + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.4.0.tgz", + "integrity": "sha512-c0Q4zYZkcLizeYJ3hNyaVUM2AA8KDhNCA3JvXY8CeZSJuBdAy3bAvSbv46RClC4P3dSO9BdwhnKEx2zOo6vP/w==" }, "warning": { "version": "4.0.3", @@ -26282,9 +26282,9 @@ "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==" }, "xmldom": { - "version": "0.1.31", - "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.1.31.tgz", - "integrity": "sha512-yS2uJflVQs6n+CyjHoaBmVSqIDevTAWrzMmjG1Gc7h1qQ7uVozNhEPJAwZXWyGQ/Gafo3fCwrcaokezLPupVyQ==" + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.5.0.tgz", + "integrity": "sha512-Foaj5FXVzgn7xFzsKeNIde9g6aFBxTPi37iwsno8QvApmtg7KYrr+OPyRHcJF7dud2a5nGRBXK3n0dL62Gf7PA==" }, "xmlhttprequest-ssl": { "version": "1.5.5",