-
Notifications
You must be signed in to change notification settings - Fork 255
feat(CLDSRV-884): Add OpenTelemetry tracing instrumentation #6140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development/9.3
Are you sure you want to change the base?
Changes from all commits
b71ccfb
cefcc5e
f6c0065
6f904c4
5fca1b0
e068307
1a650b4
3300abd
fb2ed26
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| 'use strict'; | ||
|
|
||
| const tracing = require('../tracing'); | ||
|
|
||
| let tracer = null; | ||
| function getTracer() { | ||
| if (tracer) { | ||
| return tracer; | ||
| } | ||
| const { trace } = require('@opentelemetry/api'); | ||
| const { version } = require('../../package.json'); | ||
| tracer = trace.getTracer('cloudserver-api', version); | ||
| return tracer; | ||
| } | ||
|
|
||
| function instrumentApiMethod(apiMethod, methodName) { | ||
| if (!tracing.isEnabled()) { | ||
| return apiMethod; | ||
| } | ||
|
|
||
| const api = require('@opentelemetry/api'); | ||
| const spanName = `api.${methodName}`; | ||
|
|
||
| return function instrumented(...args) { | ||
| const callbackIndex = args.findLastIndex(a => typeof a === 'function'); | ||
| const span = getTracer().startSpan(spanName, { kind: api.SpanKind.INTERNAL }); | ||
|
|
||
| // End-once guard. Multiple termination paths can race: the | ||
| // wrapped callback may fire and then the handler may also throw | ||
| // synchronously, or a callback-and-Promise hybrid handler may | ||
| // resolve after firing the callback. | ||
| let spanEnded = false; | ||
| const endSpan = err => { | ||
| if (spanEnded) { | ||
| return; | ||
| } | ||
| spanEnded = true; | ||
| if (err) { | ||
| span.recordException(err); | ||
| span.setStatus({ code: api.SpanStatusCode.ERROR }); | ||
| if (err.code) { | ||
| span.setAttribute('cloudserver.error_code', err.code); | ||
| } | ||
| } else { | ||
| span.setStatus({ code: api.SpanStatusCode.OK }); | ||
| } | ||
| span.end(); | ||
| }; | ||
|
|
||
| const wrappedArgs = [...args]; | ||
| if (callbackIndex !== -1) { | ||
| const originalCallback = args[callbackIndex]; | ||
| wrappedArgs[callbackIndex] = function wrappedCallback(err, ...results) { | ||
| endSpan(err); | ||
| return originalCallback.call(this, err, ...results); | ||
| }; | ||
| } | ||
|
|
||
| const ctx = api.trace.setSpan(api.context.active(), span); | ||
| try { | ||
| const result = api.context.with(ctx, () => | ||
| apiMethod.apply(this, wrappedArgs)); | ||
| if (callbackIndex === -1) { | ||
| if (result && typeof result.then === 'function') { | ||
| return result.then( | ||
| value => { endSpan(); return value; }, | ||
| err => { endSpan(err); throw err; }, | ||
| ); | ||
Check noticeCode scanning / CodeQL Promise .then() usage (async migration) Note
This call uses .then(). Refactor to async/await.
|
||
|
Comment on lines
+65
to
+68
|
||
| } | ||
| endSpan(); | ||
| } | ||
| // Callback-style handler: the wrapped callback drives the | ||
| // span lifecycle. If the handler also returns a thenable | ||
| // (hybrid migration shape), pass it through untouched — | ||
| // attaching a second .then() chain would surface as an | ||
| // unhandled rejection in callback-only callers. | ||
| return result; | ||
| } catch (error) { | ||
|
delthas marked this conversation as resolved.
|
||
| endSpan(error); | ||
| throw error; | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| module.exports = { instrumentApiMethod }; | ||
|
delthas marked this conversation as resolved.
delthas marked this conversation as resolved.
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we use about that ? We usually trace them as they can have an impact (as you said high-frequency). The filter is not done here but in the tool used to explore trace ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both options exist; we deliberately filter at ingest. Three reasons:
The filter is |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| 'use strict'; | ||
|
|
||
| // Probe + scrape paths that should never produce a span. Filtered at | ||
| // ingest (not at the trace backend) because probe rate × pod count × | ||
| // always-on sampling overwhelms the exporter and storage with traffic | ||
| // nobody queries. | ||
| const HEALTH_PATHS = new Set([ | ||
| '/live', | ||
| '/ready', | ||
| '/_/healthcheck', | ||
| '/_/healthcheck/deep', | ||
| '/metrics', | ||
| ]); | ||
|
|
||
| function isHealthPath(url) { | ||
| if (typeof url !== 'string' || url.length === 0) { | ||
| return false; | ||
| } | ||
| const qIdx = url.indexOf('?'); | ||
| const path = qIdx === -1 ? url : url.slice(0, qIdx); | ||
| return HEALTH_PATHS.has(path); | ||
| } | ||
|
|
||
| module.exports = { isHealthPath }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| 'use strict'; | ||
|
|
||
| const { buildTrustedHosts, makeRequestHook } = require('./trustedHosts'); | ||
| const { isHealthPath } = require('./healthPaths'); | ||
|
|
||
| let sdk = null; | ||
|
|
||
| function isEnabled() { | ||
| return process.env.ENABLE_OTEL === 'true'; | ||
| } | ||
|
|
||
| function init() { | ||
| if (!isEnabled() || sdk) { | ||
| return; | ||
| } | ||
|
|
||
| const endpoint = process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT; | ||
| if (!endpoint) { | ||
| throw new Error( | ||
| 'ENABLE_OTEL=true but OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is unset', | ||
| ); | ||
| } | ||
|
|
||
| const { diag, DiagConsoleLogger, DiagLogLevel } = require('@opentelemetry/api'); | ||
| const { NodeSDK } = require('@opentelemetry/sdk-node'); | ||
| const { resourceFromAttributes } = require('@opentelemetry/resources'); | ||
| const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-http'); | ||
| const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http'); | ||
| const { IORedisInstrumentation } = require('@opentelemetry/instrumentation-ioredis'); | ||
| const { MongoDBInstrumentation } = require('@opentelemetry/instrumentation-mongodb'); | ||
| const { | ||
| ParentBasedSampler, | ||
| TraceIdRatioBasedSampler, | ||
| } = require('@opentelemetry/sdk-trace-base'); | ||
| const { version } = require('../../package.json'); | ||
| const { config } = require('../Config'); | ||
|
|
||
| diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.WARN); | ||
|
|
||
| const parsedRatio = parseFloat(process.env.OTEL_SAMPLING_RATIO); | ||
| const samplingRatio = Number.isFinite(parsedRatio) ? parsedRatio : 0.01; | ||
|
|
||
| const trustedHosts = buildTrustedHosts(config); | ||
|
|
||
| const ignoreIncomingRequestHook = req => | ||
| req.method === 'OPTIONS' || isHealthPath(req.url); | ||
|
|
||
| sdk = new NodeSDK({ | ||
| resource: resourceFromAttributes({ | ||
| 'service.name': process.env.OTEL_SERVICE_NAME || 'cloudserver', | ||
| 'service.version': process.env.OTEL_SERVICE_VERSION || version, | ||
| 'service.namespace': process.env.OTEL_SERVICE_NAMESPACE || 'scality', | ||
| }), | ||
| traceExporter: new OTLPTraceExporter({ url: endpoint }), | ||
| logRecordProcessors: [], | ||
| metricReaders: [], | ||
| spanLimits: { | ||
| attributeValueLengthLimit: 4096, | ||
| attributeCountLimit: 128, | ||
| eventCountLimit: 128, | ||
| linkCountLimit: 128, | ||
| }, | ||
| sampler: new ParentBasedSampler({ | ||
| root: new TraceIdRatioBasedSampler(samplingRatio), | ||
| }), | ||
| instrumentations: [ | ||
| new HttpInstrumentation({ | ||
| ignoreIncomingRequestHook, | ||
| requestHook: makeRequestHook(trustedHosts), | ||
| }), | ||
| new IORedisInstrumentation({ requireParentSpan: true }), | ||
| // Mask leaf values in db.statement so query shape is captured | ||
| // without user data (object keys, filter values) flowing to | ||
| // the trace backend. | ||
| new MongoDBInstrumentation({ enhancedDatabaseReporting: false }), | ||
| ], | ||
| }); | ||
|
|
||
| sdk.start(); | ||
| } | ||
|
|
||
| // Cap the flush window. The BatchSpanProcessor's default 30s export | ||
| // timeout would otherwise block process.exit, and Kubernetes' default | ||
| // terminationGracePeriodSeconds is also 30s — we'd get SIGKILL'd | ||
| // before the flush ever completed. | ||
| const SHUTDOWN_DEADLINE_MS = 5000; | ||
|
|
||
| async function close() { | ||
| // Capture + clear before awaiting so concurrent callers (SIGTERM | ||
| // during an uncaught-exception flow, for example) don't both call | ||
| // sdk.shutdown() — the SDK doesn't guarantee idempotent shutdown. | ||
| const local = sdk; | ||
| if (!local) { | ||
| return; | ||
| } | ||
| sdk = null; | ||
| try { | ||
| await Promise.race([ | ||
| local.shutdown(), | ||
| // .unref() so the timer doesn't pin the event loop open | ||
| // when sdk.shutdown() resolves first. | ||
| new Promise(resolve => { | ||
| setTimeout(resolve, SHUTDOWN_DEADLINE_MS).unref(); | ||
| }), | ||
| ]); | ||
| } catch (err) { | ||
| // Loggers may already be torn down at this point in shutdown; | ||
| // log to stderr directly. | ||
| // eslint-disable-next-line no-console | ||
| console.error('tracing close failed', err); | ||
| } | ||
| } | ||
|
|
||
| module.exports = { init, close, isEnabled }; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But then we will not have the following trace if we have a double-ending ? Why did you introduce what ? Did you encounter that ? If we remove that, what is the impact ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes — encountered during initial unit-test development. The case: a callback-style handler fires the callback (which ends the span via the wrapped-callback closure) and then throws synchronously afterwards (e.g. the callback body itself threw). The outer
try/catchthen callsendSpan(err)and we get two.end()calls. OTEL warns on double-end and the second status overwrites the first, so the trace would record ERROR when the operation actually succeeded. The guard makes the first ending win. Theends span exactly once when callback fires then handler throwstest ininstrumentationSimple.spec.jsexercises exactly this path.