#import "@preview/clean-cnam-template:1.2.0": *
#import "template.typ": *


// Wrap the whole document in `clean-cnam-template` (imported above from
// @preview/clean-cnam-template:1.2.0) and pass it the document metadata,
// colours, fonts and the outline to display.
#show: clean-cnam-template.with(
  title: "PSM CBOR format documentation",
  author: "Olivier Langella",
  class: "technical documentation",
  affiliation: "PAPPSO",
  // No logo is passed to the template: the line below is intentionally disabled.
  //logo: image("./assets/cnam_logo.svg"),
  start-date: datetime(day: 26, month: 10, year: 2025),
  main-color: navy,
  // NOTE(review): "New Computer Modern Math" is a math font family used here
  // as the default font — confirm this is intended for body text.
  default-font: "New Computer Modern Math",
  code-font: "Andale Mono",
  // Outline handed to the template: lists headings down to level 2.
  // NOTE(review): the title reads "Table of content"; "Table of contents"
  // is probably what was meant — confirm before changing the rendered text.
  outline-code: outline(
    title: "Table of content",
    depth: 2,
    indent: auto,
  ),
)

// Declare the document language as English.
#set text(lang: "en")
// Number headings hierarchically using the "1.1" pattern.
#set heading(numbering: "1.1")

// Align figures of kind "attribute" to the start of the line.
// Presumably these are the figures produced by `attribute(...)` from
// template.typ — TODO confirm against that file.
#show figure.where(kind: "attribute"): set align(start)

// Custom rendering of `@label` references.
//
// - A reference that does not resolve to a document element
//   (`it.element == none`, e.g. a bibliography citation) is returned
//   unchanged, instead of failing when `func()` is called on `none`.
// - A reference to a figure of kind "attribute" becomes a link to that figure
//   whose text is the figure's supplement.
// - A reference to any other element becomes a link to that element whose
//   text is the element's body (for a heading, its title).
//
// Links built here are shown in small caps, coloured #993300. The styling is
// scoped to the custom branch so untouched references keep their default look.
#show ref: it => {
  let el = it.element
  if el == none {
    // Nothing to inspect: keep Typst's default rendering.
    it
  } else {
    // Style only the links generated in this branch.
    show link: lnk => {
      show text: body => {
        set text(fill: rgb("#993300"))
        smallcaps(body)
      }
      lnk
    }
    if el.func() == figure and el.kind == "attribute" {
      link(el.location(), el.supplement)
    } else {
      link(el.location(), el.body)
    }
  }
}


= Introduction

The PSM CBOR file format is designed to record any PSM (peptide spectrum match) in a compact binary file that is easy to manipulate, versatile and extensible.
The file can always be processed as a stream, which lets users rely on Unix pipes, compression algorithms and network transparency.

This way, starting from a DDA identification engine search result converted to PSM CBOR, any further process can be added:
- Feature computations
- Prediction process (retention times, ion mobility, MS2 prediction...)
- Rescoring
- Filtering

= psm CBOR format

A PSM CBOR file could look something like this:

#let code_psm = read("psmcbor.json")
#raw(code_psm,lang: "json", block: true)

= Root structure

Root sections are required and their order must be respected.



#attribute(parent: "root", tag:"informations",nature: "object", required: true)[]
#attribute(parent: "root", tag:"log",nature: "array", required: true)[
contains an array that logs all the previous "informations" sections. It helps to keep track of PSM treatments.
]

#attribute(parent: "root", tag:"parameter_map",nature: "dictionary", required: true)[
dictionary where each entry corresponds to a specific process (see @process-entry).
Each entry must contain the parameters used for this process.
]


#attribute(parent: "root", tag:"target_fasta_files",nature: "array", required: true)[
  List of paths to FASTA files (preferably absolute paths) used as reference target protein sequences for the identification engine.
]
#attribute(parent: "root", tag:"decoy_fasta_files",nature: "array", required: false)[
  List of paths to FASTA files (preferably absolute paths) used as decoy protein sequences for the identification engine (if any and not generated on the fly).
]

#attribute(parent: "root", tag:"protein_map",nature: "dictionary", required: true)[
contains an entry for each protein, using the accession as a unique identifier.
]

#attribute(parent: "root", tag:"sample_list",nature: "array", required: true)[
an array of "sample" objects (see @object-sample)
]


= Process entries <process-entry>

If a PSM CBOR process is intended to add new results to "eval" sections, it must be identified by a unique key.
This key must correspond to an entry in the parameter map (see @attribute-root-parameter_map).

Currently, several entries are already defined, but anyone can create a new one:

/ xtandem: psm and protein evaluations computed by the X!Tandem search engine <entry-xtandem>
/ sage: psm and protein evaluations computed by the Sage search engine <entry-sage>


= Protein description <object-protein>

The protein object is stored in the @attribute-root-protein_map. The protein accession is used as a dictionary key to reference each protein object. Protein accession is used in @object-psm to link PSMs to proteins.

A "protein" object is composed of the following elements (their order must be respected):
#attribute(parent: "protein", tag:"description",nature: "string", required: true)[Protein description, may be empty]
#attribute(parent: "protein", tag:"sequence",nature: "string", required: true)[Protein amino acid sequence, may be empty]
#attribute(parent: "protein", tag:"target",nature: "boolean", required: true)[
  Boolean (true by default), true if the protein belongs to the targeted sequences (target FASTA files). False otherwise.
]
#attribute(parent: "protein", tag:"contaminant",nature: "boolean", required: true)[
  Boolean (false by default), true if the protein is tagged as a contaminant protein.
]
#attribute(parent: "protein", tag:"props",nature: "object", required: false)[
  Free structure designed to store any data (not related to a particular process) related to a protein as a property.
]
#attribute(parent: "protein", tag:"eval",nature: "object", required: true)[
  Protein values computed by different algorithms or processes (described in @process-entry). May be empty.
]

= Sample description <object-sample>

"sample" object is composed of :


#attribute(parent: "sample", tag:"name",nature: "string", required: true)[Sample name]
#attribute(parent: "sample", tag:"identification_file_list",nature: "array", required: false)[Identification engine search result files]
#attribute(parent: "sample", tag:"peaklist_file",nature: "object", required: true)[]
#attribute(parent: "sample", tag:"scan_list",nature: "array", required: true)[
an array of "scan" objects (see @object-scan)
]

= Scan description <object-scan>


"scan" object is composed of :

#attribute(parent: "scan", tag:"id",nature: "object", required: true)[Scan identifier]
#attribute(parent: "scan", tag:"precursor",nature: "object", required: true)[Precursor description (MS1 related data)]
#attribute(parent: "scan", tag:"ms2",nature: "object", required: true)[MS2 related data]
#attribute(parent: "scan", tag:"psm_list",nature: "array", required: true)[
an array of "psm" objects (see @object-psm)
]


= PSM description <object-psm>


#attribute(parent: "psm", tag:"proforma",nature: "string", required: true)[Peptide proforma notation]
#attribute(parent: "psm", tag:"protein_list",nature: "array", required: true)[Related proteins, described in @object-protein-link]
#attribute(parent: "psm", tag:"props",nature: "object", required: false)[Peptide properties]
#attribute(parent: "psm", tag: "eval",nature: "dictionary", required: true)[
Peptide values computed by different algorithms or processes (described in @process-entry).
]

== Protein link description <object-protein-link>

#attribute(parent: "protein_list", tag:"accession",nature: "string", required: true)[
  Protein accession : must be described in @attribute-root-protein_map
]

#attribute(parent: "protein_list", tag:"positions",nature: "array", required: true)[
  Positions of this peptide (@attribute-psm-proforma) in the protein sequence. May be empty if the position is not known.
  
  *Important*: positions are zero-based. They range from 0 (first amino acid) to N-1 (last amino acid), where N is the protein length.
]
