Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
350 changes: 350 additions & 0 deletions docs/dev/ralph/memory-66/README.md

Large diffs are not rendered by default.

132 changes: 132 additions & 0 deletions docs/dev/ralph/memory-66/gen_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""Generate a synthetic emanote notebook of ~4500 markdown files, ~70MB total.

Each file has:
- A title heading
- Optional YAML frontmatter (omitted for every third file)
- Several paragraphs of lorem-like text with wikilinks
- A few headings
- Some inline code, occasional list, occasional code block

Wikilinks form a random graph so the link index actually has work to do.
"""
import os, random, sys, hashlib, string

# Fixed seed: all randomness in this script goes through the `random` module,
# so runs with the same arguments draw the same pseudo-random sequence.
random.seed(42)

# Optional positional arguments:
#   argv[1]  output directory
#   argv[2]  number of notes to generate
#   argv[3]  target total corpus size in bytes
OUT = sys.argv[1] if len(sys.argv) > 1 else "/home/toor/corpus"
N = int(sys.argv[2]) if len(sys.argv) > 2 else 4500
TARGET_BYTES = int(sys.argv[3]) if len(sys.argv) > 3 else 70 * 1024 * 1024
# Mean size budget per note. NOTE(review): integer division, so N == 0 raises
# ZeroDivisionError here at import time.
AVG_BYTES = TARGET_BYTES // N

# Vocabulary that titles and paragraphs are sampled from. The list contains
# repeated entries ("the", "note"), which makes those words more likely.
WORDS = ("the quick brown fox jumps over the lazy dog functor monad applicative haskell "
"pandoc emanote ema lvar parser source eval render template heist note ix set "
"memory leak profile retainer cost centre static unboxed strict thunk graph link "
"wikilink folgezettel sequel zettel obsidian roam neuron foam dendron logseq "
"atomic note structure architecture optimisation cycle measurement baseline "
"decision dependency volatility encapsulation closure capture share unsharing").split()

# Candidate values for the frontmatter `tags:` list.
TAGS_POOL = ["haskell", "design", "perf", "note", "todo", "idea", "ref", "math", "wip", "draft",
"review", "meta", "tool", "lit", "code", "infra", "ux", "ops", "test", "spec"]

def folder_for(i):
    """Return the relative directory that note number *i* is filed under.

    Notes are distributed over 32 ``topicNN`` directories by ``i % 32``.
    When *i* is a multiple of 7 the note goes one level deeper, into a
    ``subK`` directory with ``K = (i // 32) % 11``.
    """
    bucket = "topic{:02d}".format(i % 32)
    if i % 7:
        # Not a multiple of seven: the note lives directly in the topic folder.
        return bucket
    nested = "sub{}".format((i // 32) % 11)
    return os.path.join(bucket, nested)

def slug(i):
    """Return the file stem / wikilink target for note number *i*.

    The number is zero-padded to at least five digits and prefixed with ``n``.
    """
    return "n" + format(i, "05d")

def title(i):
    """Return a random title-cased phrase of two to five vocabulary words.

    The words are drawn from ``WORDS`` without replacement by position.
    The argument *i* is not used.
    """
    # Draw the length first, then the words, so the RNG is consumed in the
    # same order as a single nested call would.
    word_count = random.randint(2, 5)
    chosen = random.sample(WORDS, k=word_count)
    return " ".join(chosen).title()

def paragraph(words=120):
    """Return one sentence of random vocabulary about *words* characters long.

    Despite its name, *words* is a character budget: random entries of
    ``WORDS`` are appended until their combined length, counting one
    separator per word, reaches the budget. The first character is then
    upper-cased and a full stop is appended.

    A budget of zero or less returns ``"."`` instead of raising
    ``IndexError`` on the empty sentence.
    """
    picked = []
    used = 0  # characters consumed so far, including one separator per word
    while used < words:
        word = random.choice(WORDS)
        picked.append(word)
        # Running total replaces re-summing every collected word per iteration.
        used += len(word) + 1
    sentence = " ".join(picked)
    # Slicing (not indexing) so an empty sentence does not raise.
    return sentence[:1].upper() + sentence[1:] + "."

def wikilink(target_i, alias=None):
    """Return wikilink markup pointing at note number *target_i*.

    A truthy *alias* yields the piped ``[[slug|alias]]`` form; otherwise the
    bare ``[[slug]]`` form is returned.
    """
    inner = slug(target_i)
    if alias:
        inner = f"{inner}|{alias}"
    return f"[[{inner}]]"

def write_file(i, n, path):
    """Write synthetic note number *i* to *path*.

    Parameters:
        i: Index of the note. Decides whether frontmatter is emitted
            (skipped when ``i % 3 == 0``), whether an ``order:`` key is added
            (``i % 13 == 0``), and is embedded in the Haskell code snippets.
        n: Total number of notes; wikilink and embed targets are drawn from
            ``range(n)``.
        path: Destination file. Missing parent directories are created.

    The note is optional YAML frontmatter, a level-one heading, and then
    randomly chosen body elements (linked paragraph, sub-heading, list, code
    block, embed) appended until the summed line lengths reach a per-note
    target drawn from a normal distribution around ``AVG_BYTES`` (never
    below 2000).
    """
    lines = []
    if i % 3 != 0:
        tags = random.sample(TAGS_POOL, k=random.randint(0, 4))
        lines.append("---")
        lines.append(f"title: {title(i)}")
        if tags:
            lines.append("tags:")
            for t in tags:
                lines.append(f" - {t}")
        if i % 13 == 0:
            lines.append(f"order: {i % 50}")
        lines.append("---")
        lines.append("")
    lines.append(f"# {title(i)}")
    lines.append("")

    # Body: keep appending elements until the note is about `target` long.
    target = max(2000, int(random.gauss(AVG_BYTES, AVG_BYTES / 4)))
    # Running size (newlines not counted) instead of re-summing every line
    # on each loop iteration.
    size = sum(len(x) for x in lines)
    while size < target:
        kind = random.random()
        if kind < 0.55:
            # Paragraph with one to four words replaced by wikilinks.
            plain = paragraph(random.randint(40, 180)).split()
            tokens = list(plain)
            for _ in range(random.randint(1, 4)):
                j = random.randrange(len(plain))
                target_i = random.randrange(n)
                # Take the alias from the untouched word list: the same index
                # can be drawn twice, and aliasing the already-substituted
                # token would nest one wikilink inside another.
                alias = plain[j] if random.random() < 0.5 else None
                tokens[j] = wikilink(target_i, alias)
            chunk = [" ".join(tokens), ""]
        elif kind < 0.7:
            chunk = [f"## {title(i)}", ""]
        elif kind < 0.82:
            # Bullet list of three to eight short sentences.
            chunk = [
                f"- {paragraph(random.randint(8, 25))}"
                for _ in range(random.randint(3, 8))
            ]
            chunk.append("")
        elif kind < 0.92:
            chunk = [
                "```haskell",
                f"foo{i} :: Int -> Int",
                f"foo{i} x = x + {i}",
                "```",
                "",
            ]
        else:
            # Embedded note.
            chunk = [f"![[{slug(random.randrange(n))}]]", ""]
        lines.extend(chunk)
        size += sum(len(x) for x in chunk)

    parent = os.path.dirname(path)
    if parent:
        # os.makedirs("") raises FileNotFoundError, so skip it for a bare
        # filename in the current directory.
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

def main():
    """Generate the corpus under ``OUT`` and print a one-line size report.

    Writes a root ``index.md``, then ``N`` notes through ``write_file``, and
    finally counts every ``.md`` file found beneath ``OUT`` together with
    their combined size.
    """
    os.makedirs(OUT, exist_ok=True)

    # Landing page at the notebook root.
    index_path = os.path.join(OUT, "index.md")
    with open(index_path, "w") as index_file:
        index_file.write("# Synthetic Corpus\n\nGenerated by gen_corpus.py for emanote #66 reproduction.\n")

    for note_no in range(N):
        note_path = os.path.join(OUT, folder_for(note_no), slug(note_no) + ".md")
        write_file(note_no, N, note_path)

    # Report what is on disk, not what we believe we wrote.
    count = 0
    total = 0
    for dirpath, _subdirs, names in os.walk(OUT):
        sizes = [
            os.path.getsize(os.path.join(dirpath, name))
            for name in names
            if name.endswith(".md")
        ]
        count += len(sizes)
        total += sum(sizes)
    print(f"Wrote {count} files, {total/1024/1024:.1f} MB total", flush=True)


if __name__ == "__main__":
    main()
28 changes: 28 additions & 0 deletions docs/dev/ralph/memory-66/measure.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Start `emanote run` on a corpus, wait until it answers HTTP, then report
# how long that took and the process's resident set size at that moment.
#
# Usage: measure.sh CORPUS [RTS_FLAGS]
#   CORPUS     notebook directory (relative paths are resolved before use)
#   RTS_FLAGS  optional RTS flags, passed to emanote between +RTS ... -RTS
# Environment overrides: EMANOTE (binary path), emanote_datadir, PORT,
# TIMEOUT (seconds to wait for the first successful HTTP response).
set -eo pipefail
EMANOTE=${EMANOTE:-/home/toor/code/emanote/dist-newstyle/build/x86_64-linux/ghc-9.8.4/emanote-2.0.0.0/x/emanote/build/emanote/emanote}
export emanote_datadir=${emanote_datadir:-/home/toor/code/emanote/emanote/default}
CORPUS=${1:?corpus path}
RTS=${2:-}
PORT=${PORT:-$(( RANDOM % 10000 + 9000 ))}
TIMEOUT=${TIMEOUT:-600}
LOG=$(mktemp)
PID=

# Runs on every exit path so neither the server nor the temp log outlives
# this script (early `exit 1`, a `set -e` abort, or normal completion).
cleanup() {
  if [ -n "$PID" ] && kill -0 "$PID" 2>/dev/null; then
    kill "$PID" 2>/dev/null || true
    wait "$PID" 2>/dev/null || true
  fi
  rm -f "$LOG"
}
trap cleanup EXIT

cd "$CORPUS"
# Use the absolute path from here on: a relative CORPUS would otherwise be
# resolved a second time, against the directory we just entered.
CORPUS=$PWD

RTS_ARGS=()
if [ -n "$RTS" ]; then
  # RTS is deliberately unquoted so several flags split into separate words.
  RTS_ARGS=(+RTS $RTS -RTS)
fi
"$EMANOTE" -L "$CORPUS" run --port "$PORT" "${RTS_ARGS[@]}" > "$LOG" 2>&1 &
PID=$!

# Poll once per second. Elapsed time comes from bash's SECONDS counter rather
# than the iteration count, because a curl attempt can itself take up to 1 s.
START=$SECONDS
READY=
while (( SECONDS - START < TIMEOUT )); do
  if ! kill -0 "$PID" 2>/dev/null; then
    echo "emanote died" >&2
    tail -40 "$LOG" >&2
    exit 1
  fi
  if curl -s -o /dev/null --max-time 1 "http://localhost:$PORT/"; then
    READY=$(( SECONDS - START ))
    break
  fi
  sleep 1
done
if [ -z "$READY" ]; then
  echo "timeout" >&2
  tail -40 "$LOG" >&2
  exit 1
fi

# VmRSS in /proc/<pid>/status is reported in kB; convert to MB for the report.
LOAD_RSS=$(awk '/VmRSS/{print $2}' "/proc/$PID/status")
echo "READY_AFTER_S=$READY"
echo "LOAD_RSS_MB=$(awk -v r="$LOAD_RSS" 'BEGIN{printf "%.0f", r/1024}')"

# Ask for a graceful shutdown first, then fall back to the default signal.
kill -INT "$PID" 2>/dev/null || true
sleep 2
kill "$PID" 2>/dev/null || true
wait "$PID" 2>/dev/null || true
echo "---LOG TAIL---"
tail -60 "$LOG"
7 changes: 6 additions & 1 deletion emanote/emanote.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ common library-common
, commonmark-wikilink >=0.2
, containers
, data-default
, deepseq
, deriving-aeson
, directory
, ema >=0.10.1
Expand Down Expand Up @@ -241,7 +242,11 @@ executable emanote
import: library-common
hs-source-dirs: exe
main-is: Main.hs
ghc-options: -threaded -rtsopts -with-rtsopts=-N
-- -F1.5: shrink the old-generation retention factor from the GHC default
-- (2.0) to 1.5, trading a few extra major GCs for ~30% lower RSS on large
-- notebooks (see docs/dev/ralph/memory-66/README.md, cycle 2). Users can
-- still override at runtime, e.g. `emanote run +RTS -F2 -RTS`.
ghc-options: -threaded -rtsopts "-with-rtsopts=-N -F1.5"

if flag(ghcid)
hs-source-dirs: src
Expand Down
31 changes: 17 additions & 14 deletions emanote/src/Emanote/Model/Graph.hs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ module Emanote.Model.Graph where
import Commonmark.Extensions.WikiLink qualified as WL
import Data.IxSet.Typed ((@+), (@=))
import Data.IxSet.Typed qualified as Ix
import Data.Map.Strict qualified as Map
import Data.Set qualified as Set
import Data.Tree (Forest, Tree (Node))
import Emanote.Model.Calendar qualified as Calendar
Expand All @@ -12,7 +11,7 @@ import Emanote.Model.Link.Resolve qualified as Resolve
import Emanote.Model.Meta (lookupRouteMeta)
import Emanote.Model.Note qualified as MN
import Emanote.Model.Note qualified as N
import Emanote.Model.Type (Model, modelIndexRoute, modelNotes, modelRels, parentLmlRoute)
import Emanote.Model.Type (Model, modelIndexRoute, modelLookupNoteByRoute', modelNotes, modelRels, parentLmlRoute)
import Emanote.Route qualified as R
import Emanote.Route.SiteRoute qualified as SR
import Optics.Operators as Lens ((^.))
Expand Down Expand Up @@ -176,20 +175,24 @@ lookupNoteByWikiLink model currentRoute wl = do
modelLookupBacklinks :: R.LMLRoute -> Model -> [(R.LMLRoute, NonEmpty [B.Block])]
modelLookupBacklinks r model =
sortOn (Calendar.backlinkSortKey model . fst)
$ groupNE
$ mapMaybe withCtx
$ groupBySource
$ backlinkRels r model
<&> \rel ->
(rel ^. Rel.relFrom, rel ^. Rel.relCtx)
where
groupNE :: forall a b. (Ord a) => [(a, b)] -> [(a, NonEmpty b)]
groupNE =
Map.toList . foldl' f Map.empty
where
f :: Map a (NonEmpty b) -> (a, b) -> Map a (NonEmpty b)
f m (x, y) =
case Map.lookup x m of
Nothing -> Map.insert x (one y) m
Just ys -> Map.insert x (ys <> one y) m
-- Group backlink-rels by their source route. Context blocks are no
-- longer carried on each Rel (#66) — instead they are recovered once
-- per source note by re-walking the source's Pandoc, which is cheap
-- (one note's AST) compared to retaining contexts in _modelRels for
-- every link in the entire notebook.
groupBySource :: [Rel.Rel] -> [R.LMLRoute]
groupBySource = ordNub . fmap (^. Rel.relFrom)
targetMR :: R.ModelRoute
targetMR = R.ModelRoute_LML R.LMLView_Html r
withCtx :: R.LMLRoute -> Maybe (R.LMLRoute, NonEmpty [B.Block])
withCtx from = do
sourceNote <- modelLookupNoteByRoute' from model
ctxs <- nonEmpty $ Rel.noteRelCtxToTarget targetMR sourceNote
pure (from, ctxs)

-- | Rels pointing *to* this route
backlinkRels :: R.LMLRoute -> Model -> [Rel.Rel]
Expand Down
28 changes: 27 additions & 1 deletion emanote/src/Emanote/Model/Link/Rel.hs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,33 @@ noteRels note =
pure (target, ctx)
in Ix.fromList $ zipWith mkRel [0 ..] links
where
mkRel srcPos (target, ctx) = Rel (note ^. noteRoute) target srcPos ctx
-- Drop the per-Rel `[B.Block]` context at insert time and recover
-- it on demand at backlink-render time by re-walking the source
-- note's Pandoc (see 'noteRelCtxToTarget' / 'modelLookupBacklinks'
-- in @Emanote.Model.Graph@). The context is a chunk of Pandoc
-- Blocks per outgoing link; with thousands of notes and dozens of
-- outgoing links each, persisting it in @_modelRels@ dominates the
-- live-data overhead (#66). The on-demand walk is bounded by the
-- source note's own AST size — fast for any single backlinks page.
mkRel srcPos (target, _ctx) = Rel (note ^. noteRoute) target srcPos []

{- | Collect the Pandoc-block context of every link in @sourceNote@ whose
parsed target is one of the unresolved forms of @targetMR@.

'noteRels' stores an empty context on each 'Rel' (#66), so callers that
need the surrounding blocks — the backlinks lookup in
@Emanote.Model.Graph@ — call this to re-derive them from the source
note's document on demand. Each call queries the links of one note's
document.
-}
noteRelCtxToTarget :: ModelRoute -> Note -> [[B.Block]]
noteRelCtxToTarget targetMR sourceNote =
  [ ctx
  | (url, instances) <- Map.toList linkContexts
  , -- The per-URL instances are consumed in reverse of their stored order.
  (attrs, ctx) <- reverse (toList instances)
  , -- Links whose target cannot be parsed are skipped.
  Just (target, _) <- [parseUnresolvedRelTarget linkBase attrs url]
  , target `elem` candidates
  ]
  where
    linkContexts = LC.queryLinksWithContext (sourceNote ^. noteDoc)
    linkBase = noteResolveLinkBase sourceNote
    candidates = unresolvedRelsTo targetMR

{- | All `UnresolvedRelTarget`s that could resolve to the given
`ModelRoute`. Each `URTResource` form is built by re-parsing a URL
Expand Down
5 changes: 5 additions & 0 deletions emanote/src/Emanote/Source/Patch.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ module Emanote.Source.Patch (
ignorePatterns,
) where

import Control.DeepSeq (deepseq)
import Control.Monad.Logger (LoggingT (runLoggingT), MonadLogger, MonadLoggerIO (askLoggerIO))
import Data.Aeson qualified as Aeson
import Data.ByteString qualified as BS
import Data.List qualified as List
import Data.List.NonEmpty qualified as NEL
Expand Down Expand Up @@ -255,6 +257,9 @@ parseAndInsert noteF model refreshAction r src = do
s <- readRefreshedFile refreshAction (locResolve src)
note <-
N.parseNote (model ^. M.modelScriptingEngine) (M.modelPluginBaseDir model) r src (decodeUtf8 s)
-- Force the parsed Pandoc and Aeson Value so per-file parser closures
-- can be released as we stream files into the model (#66).
note ^. N.noteDoc `deepseq` (note ^. N.noteMeta :: Aeson.Value) `deepseq` pure ()
pure
$ M.modelInsertNote (noteF note)
>>> (modelSourceDependencies %~ SDeps.setLuaDeps r src (note ^. N.notePandocFilterDeclarations))
Expand Down
16 changes: 7 additions & 9 deletions emanote/test/Emanote/Model/Link/RelSpec.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
module Emanote.Model.Link.RelSpec where

import Commonmark.Extensions.WikiLink qualified as WL
import Data.IxSet.Typed qualified as Ix
import Emanote.Model.Link.Rel
import Emanote.Model.Note qualified as MN
Expand Down Expand Up @@ -100,21 +99,20 @@ spec = do
got === want
describe "noteRels source order (issue #186)" $ do
it "orders rels by source position, not by lexicographic Ord on context" $ do
-- 'Z' sorts last lexicographically but comes first in source; 'A'
-- sorts first but comes second. Without the srcPos tie-breaker,
-- Ord [Block] would yield A-then-Z; we want source order.
-- Both 'z' and 'a' link to the same target via the same URL, so
-- the two rels share (_relFrom, _relTo) and can only be ordered
-- by _relSrcPos. Source order is "Z first" then "A second", so
-- IxSet.toList should produce srcPos [0, 1] in that order.
-- (#66 dropped _relCtx — see Rel.noteRelCtxToTarget for the
-- on-demand backlinks-context recovery path.)
let mkLink lbl = B.Link B.nullAttr [B.Str lbl] ("Foo.md", "")
note =
MN.mkEmptyNoteWith
barRoute
[ B.Para [B.Str "Z first: ", mkLink "z"]
, B.Para [B.Str "A second: ", mkLink "a"]
]
paraText rel = case _relCtx rel of
[B.Para is] -> WL.plainify is
other -> error $ "expected single-paragraph context, got " <> show other
(paraText <$> Ix.toList (noteRels note))
`shouldBe` ["Z first: z", "A second: a"]
(_relSrcPos <$> Ix.toList (noteRels note)) `shouldBe` [0, 1]
it "does not collapse two identical-context links to the same target" $ do
-- One paragraph mentions Foo.md twice. The two rels share
-- (relFrom, relTo, relCtx); without srcPos in Ord, IxSet.fromList's
Expand Down
Loading