Publish "Using Software Heritage for technical writing"

This commit is contained in:
Gabriel Arazas 2023-05-03 13:33:37 +08:00
parent faec906446
commit 1a7f15dbd4
No known key found for this signature in database
GPG Key ID: ADE0C41DAB221FCC
10 changed files with 521 additions and 0 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 231 KiB

View File

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg version="1.1" viewBox="0 0 919.03 218.34" xmlns="http://www.w3.org/2000/svg">
<g transform="translate(5.0002 6.1078)" font-size="37.333px">
<text x="-1.7504619" y="117.07249" stroke-width="0" xml:space="preserve"><tspan x="-1.7504619" y="117.07249"><tspan fill="var(--base05)">swh</tspan><tspan fill="var(--base0F)">:</tspan><tspan fill="var(--base0A)">1</tspan><tspan fill="var(--base0F)">:</tspan><tspan fill="var(--base09)">rev</tspan><tspan fill="var(--base0F)">:</tspan><tspan fill="var(--base0B)">c4f3a3707104999d5b6fe4c4e5c3833980a92513</tspan></tspan></text>
<g transform="translate(140.37 -73.863)" fill="var(--base0B)">
<path class="UnoptimicedTransforms" transform="translate(40.573,-4.3625)" d="m366.8 203.49c0.68831 0.11851 0.50863 4.0327 0.64628 10.351 0.1326 6.086 0.67133 15.288 4.5676 23.765 1.0278 2.2912 2.3211 4.537 3.8789 6.6904 3.1363 4.3354 6.9792 7.7989 10.937 10.379h1e-5c6.0542 3.9566 12.442 6.1096 16.909 7.4197 2.4787 0.72403 4.4984 1.2224 5.9156 1.6257 0.72999 0.2077 1.3061 0.39294 1.6922 0.56247 0.19731 0.0866 0.34858 0.17101 0.4501 0.25329 0.0507 0.0415 0.0895 0.0829 0.11555 0.12398 0.0128 0.0616 0.0227 0.0932 0.0295 0.10942 7e-3 0.0156 0.0102 0.0168 0.0107 0.0169h1e-5c3e-5 0 0 0 0 0s3e-5 0 0 0h-1e-5c-4.8e-4 7e-5 -4e-3 1e-3 -0.0111 0.0166-7e-3 0.016-0.0179 0.0473-0.0323 0.10848v1e-5c-0.0281 0.0403-0.0699 0.0798-0.12523 0.11869-0.10906 0.0773-0.26966 0.15077-0.4808 0.2194-0.41664 0.13541-1.023 0.24873-1.8111 0.32465-1.5191 0.14638-3.6999 0.1526-6.4096-0.14413-4.9136-0.53319-11.962-2.1167-19.039-6.2093-4.6603-2.7034-9.1681-6.5561-12.79-11.571-1.7963-2.4871-3.2641-5.1095-4.4023-7.7983v-1e-5c-4.1273-9.9763-3.942-19.846-3.0958-26.221 0.8715-6.5655 2.3571-10.259 3.0454-10.141z"/>
<text x="459.50897" y="270.69006" stroke-width="0" xml:space="preserve"><tspan x="459.50897" y="270.69006">object identifier</tspan></text>
<path class="UnoptimicedTransforms" transform="translate(-12.77,19.743)" d="m461.14 230.01c1.5758-0.7359 4.5503 3.8007 7.3638 10.855-0.22246 0.28921-0.45011 0.5816-0.68242 0.87604-4.9296 6.248-9.4729 10.128-10.835 8.913-1.3914-1.2411 1.6339-5.7283 5.9285-9.8802 0.22035-0.21302 0.44144-0.4237 0.66262-0.63184-2.636-4.7018-4.0467-9.3804-2.4374-10.132z"/>
</g>
<g transform="translate(-28.311 -73.863)" fill="var(--base0A)">
<path class="UnoptimicedTransforms" d="m115.63 194.84c0.20917-0.0609 0.64911 0.42288 1.266 1.3792 0.42884 0.65636 1.0138 1.627 1.6572 2.7337 1.2552 2.1584 2.8869 5.0753 4.8777 8.5584 3.6713 6.4204 8.6208 14.941 14.746 23.697 0.99393 1.4137 2.0198 2.8468 3.0734 4.2732 5.3388 7.2274 10.32 12.796 15.887 17.244h1e-5c3.6779 2.9308 6.9454 4.8468 10.184 6.0331 6.2822 2.3166 10.771 1.2663 10.944 1.9037 0.17263 0.63741-4.3424 3.17-11.919 1.4181-3.8017-0.88281-7.8172-2.7093-11.98-5.6591-6.2529-4.4172-11.922-10.3-17.422-17.762-1.093-1.4827-2.1365-2.9687-3.1323-4.4377-6.2319-9.1479-10.787-18.075-13.815-24.983-1.6514-3.7647-2.8266-6.8858-3.5954-9.3386-0.40185-1.2821-0.67326-2.3196-0.82982-3.1507-0.21778-1.168-0.201-1.8332 0.0585-1.9088z"/>
<text x="184.00632" y="270.69006" stroke-width="0" xml:space="preserve"><tspan x="184.00632" y="270.69006">scheme version</tspan></text>
<path class="UnoptimicedTransforms" d="m171.33 251.09c1.2325-0.91972 5.0462 2.949 9.1973 9.074-0.0865 0.22193-0.17472 0.44561-0.26456 0.67073-2.833 7.0998-5.912 12.008-7.3566 11.348-1.4688-0.67014 0.0695-5.754 2.8417-11.004 0.0947-0.17937 0.18996-0.35775 0.28559-0.53505h-1e-5c-3.5716-4.2658-5.9559-8.6199-4.7034-9.5545z"/>
</g>
<g transform="translate(-28.311 -73.863)" fill="var(--base05)">
<path class="UnoptimicedTransforms" d="m36.825 161.53c-0.75323 0.18123-2.5018-4.1274-4.0789-11.284h-1e-6c-1.1861-5.3728-2.3055-12.517-2.4493-20.371 1e-6 0 0 0 0 0-0.01885-0.84843-0.02708-1.7104-0.02141-2.5824 0.04508-6.9262 0.97568-12.958 3.0964-18.285h1e-6c1.3022-3.2896 3.0469-5.8724 5.0322-7.7587 4.3733-4.1214 7.9458-3.3196 8.0046-3.102 0.16062 0.59497-2.9163 1.6628-5.6085 5.2783-1.3676 1.8088-2.4517 4.0514-3.3722 7.023h-1e-6c-1.514 4.9139-2.0397 10.229-2.1078 16.887-0.0085 0.83043-0.01004 1.6581-0.0076 2.4762 0.05249 7.4308 0.69735 14.546 1.1725 19.855 0.6429 7.1272 1.0464 11.694 0.33987 11.864z"/>
<text x="56.50531" y="101.82629" stroke-width="0" xml:space="preserve"><tspan x="56.50531" y="101.82629">prefix</tspan></text>
<path class="UnoptimicedTransforms" transform="translate(5.7253,-3.2291)" d="m32.075 93.683c0.80362-0.99123 5.3284 1.7022 10.815 6.32h1e-6c0.03079 0.453 0.05894 0.91466 0.08401 1.3831 0.41152 7.6885-0.23057 13.623-1.5883 13.659-1.3682 0.0362-2.293-5.5067-2.2663-11.959 0.0017-0.41411 0.0068-0.82504 0.01481-1.2317l-1e-6 -1e-5c-4.544-3.5233-7.8674-7.1748-7.0594-8.1714z"/>
</g>
<g transform="translate(-28.311 -73.863)" fill="var(--base09)">
<path class="UnoptimicedTransforms" d="m152.43 165.36c-0.0234-0.0101-0.0448-0.0242-0.0646-0.0425-0.0191-0.0177-0.0365-0.0393-0.0521-0.0648-0.0312-0.0508-0.0554-0.11687-0.0727-0.19789-0.0343-0.16203-0.0403-0.38294-0.0181-0.66091 0.0444-0.55421 0.20118-1.3263 0.47551-2.3021 0.5442-1.9357 1.5328-4.6133 3.0256-7.8973 2.8964-6.3737 7.6261-14.908 14.555-23.907 3.4424-4.4772 7.4226-9.0443 11.968-13.413 4.6167-4.4363 9.3748-8.2426 14.203-11.446 9.8659-6.5479 19.218-9.8136 26.263-11.232 6.9253-1.3943 11.136-1.0428 11.237-0.28679 0.1008 0.75598-3.8498 1.8066-10.113 4.2001-6.4361 2.4595-14.666 6.2254-23.664 12.708v1e-5c-4.4074 3.176-8.7801 6.8269-13.144 11.028-4.2884 4.128-8.1636 8.3977-11.634 12.543-6.9869 8.3583-12.239 16.117-16.041 21.819-1.9112 2.867-3.5192 5.3144-4.632 6.8567-0.5792 0.80278-1.0632 1.4168-1.4418 1.806-0.19116 0.19651-0.35784 0.33797-0.49868 0.42074-0.0708 0.0417-0.13531 0.0686-0.19341 0.0804-0.0291 6e-3 -0.0567 8e-3 -0.0826 6e-3 -0.0268-2e-3 -0.0517-8e-3 -0.0751-0.0177z"/>
<text x="239.05254" y="101.45296" stroke-width="0" xml:space="preserve"><tspan x="239.05254" y="101.45296">object type</tspan></text>
<path class="UnoptimicedTransforms" d="m226.67 82.279c1.0719-0.70804 5.0827 4.2614 9.5233 11.5v1e-6c-0.1043 0.21584-0.20991 0.43306-0.31675 0.65143-3.8511 7.871-7.5677 13.513-8.733 12.892-1.1792-0.62925 1.278-6.4891 4.9858-12.816 0.1084-0.18497 0.21692-0.369 0.32545-0.55198v-1e-6c-4.0636-5.6627-6.8692-10.959-5.7848-11.675z"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 6.2 KiB

View File

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg width="973.56" height="296.44" font-size="37.333px" version="1.1" viewBox="0 0 973.56 296.44" xmlns="http://www.w3.org/2000/svg">
<text x="4.5522251" y="111.58363" xml:space="preserve"><tspan x="4.5522251" y="111.58363"><tspan fill="var(--base05)">swh:1:rev:c4f3a3707104999d5b6fe4c4e5c3833980a92513</tspan><tspan fill="var(--base07)">;</tspan></tspan><tspan x="4.5522251" y="158.24988" fill="var(--base0B)">origin=https://github.com/nix-community/home-manager<tspan fill="var(--base07)">;</tspan></tspan><tspan x="4.5522251" y="204.91612" fill="var(--base0B)">visit=swh:1:snp:c452aec4a5c8fd93484cc4215c20da568f3fbbc8</tspan></text>
<g transform="translate(-310.19 86.855)" fill="var(--base05)">
<path class="UnoptimicedTransforms" d="m762.79-13.433c-0.35699-0.53054 4.5623-4.5796 12.909-10.641 8.0719-5.8612 20.417-14.332 33.879-21.876 13.25-7.4243 24.839-12.434 34.903-15.745 6.7819-2.2527 11.146-3.0368 14.957-2.9137 1.3593 0.04064 2.301 0.23202 2.9633 0.49919 0.28618 0.11518 0.48692 0.23531 0.61181 0.34856v1e-6c0.0625 0.05514 0.10351 0.10522 0.12468 0.14999v1e-6c-5e-3 0.05038-4e-3 0.07827-1e-3 0.09365 2e-3 0.0147 6e-3 0.01789 7e-3 0.01836 7e-5 5e-5 4e-5 3.1e-5 4e-5 3e-5s3e-5 1.7e-5 -5e-5 -1.3e-5c-7.6e-4 -2.74e-4 -5e-3 -0.0013-0.0181 0.0064-0.0133 8e-3 -0.0356 0.02549-0.0715 0.06297v1e-6c-0.0408 0.01727-0.0976 0.03304-0.16226 0.04565-0.12261 0.01998-0.37263 0.04258-0.62119 0.06127-0.78873 0.05945-1.3229 0.10859-2.7169 0.3559-3.9404 0.70873-7.0172 1.6873-13.882 4.4911-9.8661 4.0648-20.541 9.1084-33.826 16.567-12.515 7.0255-26.544 15.843-34.325 20.571-8.793 5.3436-14.373 8.4351-14.73 7.9046z"/>
<path class="UnoptimicedTransforms" d="m855.62-76.438c1.1722-0.71538 5.3348 4.6591 9.6926 12.273 0.1682 0.29389 0.33373 0.58582 0.49642 0.87525v1e-6c-2.9337 7.8611-5.9321 13.383-7.1582 12.892-1.2354-0.4952 0.29172-6.2096 3.0385-12.547v-1e-6c-0.16489-0.24735-0.32979-0.49691-0.49445-0.74844-4.0597-6.2017-6.7573-12.023-5.5748-12.744z"/>
<text x="871.37671" y="-54.825691" xml:space="preserve"><tspan x="871.37671" y="-54.825691">core identifier</tspan></text>
</g>
<g transform="translate(4.5481 68.906)" fill="var(--base0B)">
<path class="UnoptimicedTransforms" d="m599.69 151.01c0.47293-0.35412 4.0022 3.691 10.043 10.003 5.9778 6.2462 14.728 14.9 25.824 22.505 1.8362 1.2586 3.7395 2.4909 5.7066 3.6806 8.1333 4.9189 15.995 8.2886 23.497 10.435 8.45 2.4177 15.321 3.0458 20.576 3.1539 2.6118 0.0539 4.3942-0.0183 5.8684-0.056 0.68255-0.0174 1.2333-0.0241 1.5965-1e-3 0.18583 0.0116 0.33158 0.0314 0.43347 0.0609 0.0611 0.0504 0.10367 0.077 0.1327 0.0935 0.0169 0.0169 0.0284 0.0248 0.0357 0.0284l0.0107 3e-3s1e-5 0 1e-3 6e-3c-0.0137 9e-3 -0.0196 0.0221-0.0258 0.0452-0.0171 0.0286-0.0425 0.0739-0.072 0.14722-0.0782 0.0763-0.20128 0.16177-0.36954 0.25511-0.33691 0.18688-0.84314 0.39853-1.5254 0.61944-1.3683 0.44299-3.352 0.89757-5.9699 1.1996-5.374 0.61943-12.795 0.53842-21.728-1.6042-7.9699-1.9121-16.378-5.2842-24.862-10.416-2.0548-1.2429-4.0321-2.5395-5.9285-3.8722v-1e-5c-11.487-8.073-20.042-17.342-25.527-24.257-5.5256-6.9664-8.1906-11.674-7.7177-12.028z"/>
<path class="UnoptimicedTransforms" d="m685.62 183.68c1.1736-0.4735 5.0376 7.0927 9.1893 17.701-0.36711 0.75265-0.74526 1.5187-1.133 2.2941-5.5727 11.144-10.798 19.453-12.004 18.825-1.2111-0.63058 2.4509-9.364 7.8101-19.229 0.38451-0.70776 0.76882-1.4061 1.151-2.0926-3.8356-9.1977-6.1904-17.024-5.0134-17.499z"/>
<text x="701.56409" y="211.01204" xml:space="preserve"><tspan x="701.56409" y="211.01204">qualifiers</tspan></text>
</g>
</svg>

After

Width:  |  Height:  |  Size: 3.6 KiB

View File

@ -0,0 +1,26 @@
// All of the SWHIDs.
:swh-system76-firmware-license: swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2;origin=https://github.com/pop-os/system76-firmware;lines=471-538
:swh-nixpkgs-22-11: swh:1:rev:db1e4eeb0f9a9028bcb920e00abbc1409dd3ef36;origin=https://github.com/NixOS/nixpkgs;visit=swh:1:snp:857ce072b5dbf50f1ae55d8233cb321dd42b5992
:swh-nixpkgs-22-11-maintainers-dir: swh:1:dir:101a60787ec70986789c64d2379be174ed73e2e5;origin=https://github.com/NixOS/nixpkgs;visit=swh:1:snp:857ce072b5dbf50f1ae55d8233cb321dd42b5992;anchor=swh:1:rev:db1e4eeb0f9a9028bcb920e00abbc1409dd3ef36;path=/maintainers/
:swh-gnome-shell-3-38-6: swh:1:rel:8763b71ed3a51974c61edb7781832a50b176f966;origin=https://gitlab.gnome.org/GNOME/gnome-shell;visit=swh:1:snp:54081c29aa31e4a626a06b70e2a8571fad83e092
:swh-gnome-shell-jan-4-2023: swh:1:snp:fc3c21b5f61d1e283ba9ec52f632c372675eaebc;origin=https://gitlab.gnome.org/GNOME/gnome-shell
:swh-full-qualifiers-example: swh:1:rev:c4f3a3707104999d5b6fe4c4e5c3833980a92513;origin=https://github.com/nix-community/home-manager;visit=swh:1:snp:c452aec4a5c8fd93484cc4215c20da568f3fbbc8
// The previous list of SWHIDs but only with the core identifier. This is used
// first before we eventually used the above list of SWHIDs for introducing
// SWHID qualifiers.
:swh-bare-system76-firmware-license: swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2
:swh-bare-nixpkgs-22-11: swh:1:rev:db1e4eeb0f9a9028bcb920e00abbc1409dd3ef36
:swh-bare-nixpkgs-22-11-maintainers-dir: swh:1:dir:101a60787ec70986789c64d2379be174ed73e2e5
:swh-bare-gnome-shell-3-38-6: swh:1:rel:8763b71ed3a51974c61edb7781832a50b176f966
:swh-bare-gnome-shell-jan-4-2023: swh:1:snp:fc3c21b5f61d1e283ba9ec52f632c372675eaebc
// Invalid SWHIDs.
:swhid-content-with-invalid-origin: swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2;origin=https://github.com/nonexistentuser/nonexistentrepo
:swhid-content-with-invalid-path: swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2;origin=https://github.com/pop-os/system76-firmware;path=hello/COPYING
// The rest of the attributes.
:git-ref-foodogsquared-website: e75ecbb866a16e2a94d21b0921e5a5101069abfc
:gnome-gitlab: gitlab.gnome.org
:asciidoc-go-template-hugo-featuring-nix-post: ../2023-04-10-asciidoc-go-template-and-hugo-featuring-nix/index.adoc

View File

@ -0,0 +1,454 @@
---
title: "Using Software Heritage for technical writing"
date: 2023-05-12T00:00:00+00:00
---
= Using Software Heritage for technical writing
Gabriel Arazas <foodogsquared@foodogsquared.one>
v1.0.0, 2023-05-12: Initial version
include::./attributes.adoc[]
There's this nice service called link:https://www.softwareheritage.org/[Software Heritage], a large software archive intended to be used as a reference for software in writings (e.g., research papers, blog posts, technical documents).
We'll be showing how to take advantage of it for technical writing as it provides the following benefits.
- It offers a centralized and universal way of identifying and referring to software similarly to wikipedia:Digital_object_identifiers[Digital object identifiers] (DOI).
- link:https://archive.softwareheritage.org/#swh-coverage-content[It consolidates all of the sources into one centralized archive], reducing the need to search through and manage different forges such as GitHub, GitLab, Bitbucket, and Sourcehut.
- Long-term preservation which mitigates against problems such as vanishing upstreams and sunsetting services.
For example, when referring to github:NixOS/nixpkgs[rev=nixos-22.11], if GitHub ever goes down or the Nix community decides to move to another Git forge, the reference won't be affected once swh:{swh-nixpkgs-22-11}[the software has been archived within Software Heritage], and it can be referred to for the rest of time.
- It offers granularity in what part of the software technical writers can refer to: from the whole project, to a certain point in history, to certain files and directories, and all the way down to lines of code.
[#how-software-heritage-works]
== How does Software Heritage work?
Software Heritage link:https://archive.softwareheritage.org/coverage[actively archives software from several sources] such as...
- Software forges like GitHub, GitLab instances, and even Gitea instances.
- Linux distributions package archives such as from Debian, Nix, and Guix.
- Several software indices such as PyPI, crates.io, and npm.
The service will periodically capture a snapshot of the same software project (which we have access to, among other things as we'll see later in the post).
[NOTE]
====
One thing you have to keep in mind with this service is the project developers don't check the source code for any issues (e.g., quality, intent).
Whatever is stored in the original source will be included as part of the archive.
====
Furthermore, link:https://archive.softwareheritage.org/save/[it can save source code that is managed by different version control software] such as Git, Mercurial, and Subversion.
Software Heritage is also offering its services with its link:https://archive.softwareheritage.org/api/1/[API over HTTPS] which is nice if you want to create some neat little scripts or integrate it in your software.
Most of the functionality is already available on its website, which I cover in <<using-the-software-heritage-archive-website, a later section>>.
But first, we'll have to learn an important concept with Software Heritage: its identifier system to access the archive in the first place.
[#sidebar:dialog-on-archive-statistics]
.Dialog on archive statistics
****
[chat, Ezran, state=proud, role=reversed]
====
As of 2023-05-11, Software Heritage contains at least 230 million projects with 3.2 billion commits.
It is only expected to go up throughout the years.
GitHub takes the majority of the sources as the archive has 175 million projects from it.
====
[chat, foodogsquared, state=curious]
====
Does that include forks and everything?
Also, that's a bit scary to think about GitHub having 75% of the archive.
What about the second largest?
====
[chat, Ezran, role=reversed]
====
I'm fairly sure it does include forks in the archive which makes the actual number of projects a lot less but it is still an impressive count considering the wide coverage of sources they monitor.
They only archive public repos or if the user opts in to the archival integration such as link:https://docs.github.com/en/enterprise-cloud@latest/get-started/privacy-on-github/opting-into-or-out-of-the-github-archive-program-for-your-public-repository[in GitHub].
As for the second largest source, it seems to come from GitLab instances totalling... 4 million projects.
That's at least 1%.
====
[chat, foodogsquared, state=disappointed]
====
Well, that's at least... unfortunate.
====
****
[#its-identifier-system]
== Its identifier system
The main intention of the project is to provide a centralized archive for identifying and referencing software.
The primary way of using such service is with an identifier system like wikipedia:Digital_object_identifiers[DOI] and wikipedia:ISBN[].
Software Heritage uses its own identifier system called link:https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html[SoftWare Heritage persistent IDentifier] or SWHID for short.
The identifier follows a certain format.
.Parts of SWHID
image::./assets/swhid-parts.svg[Parts of SWHID with colorized text, height=100%, width=100%, opts=inline]
The following examples should be enough to show what they look like.
[#tbl:swhid-list]
.Examples of SWHIDs
[%header.break-anywhere,cols="2,1"]
|===
| SWHID
| Description
| swh:{swh-bare-system76-firmware-license}[]
| GPLv3 document.
| swh:{swh-bare-nixpkgs-22-11-maintainers-dir}[]
| `maintainers/` directory from github:NixOS/nixpkgs[opts=repo].
| swh:{swh-bare-nixpkgs-22-11}[]
| 22.11 branch of github:NixOS/nixpkgs[opts=repo].
| swh:{swh-bare-gnome-shell-3-38-6}[]
| gitlab:GNOME/gnome-shell[GNOME Shell v3.38.6, domain={gnome-gitlab}, rev=3.38.6] release.
| swh:{swh-bare-gnome-shell-jan-4-2023}[]
| A gitlab:GNOME/gnome-shell[opts=repo] snapshot.
|===
As you can tell from the table, SWHID also offers some control over the granularity of what parts of software we want to refer to: from individual files and directories, from a certain point in the history of the project, and from a certain point of time of capture.
[NOTE]
====
The parts of software such as files, directories, and revisions are collectively referred to as software artifacts (or objects) as you'll see from its documentation.
====
[chat, foodogsquared, state=curious]
====
What about pointing to specific lines of code?
Didn't the preamble mention something like "granularity down to the lines of code"?
====
[chat, Ezran, role=reversed]
====
You'll see it later.
====
An interesting property with SWHIDs is that they are intrinsic identifiers: meaning you get the object alongside the identifier.
Unlike DOIs and ISBNs where identifiers are arbitrarily assigned to objects by a central authority, SWHIDs are computationally generated from the object.
This means SWHIDs are deterministic and we can do a reverse lookup with the object.
In fact, it can be computed with objects locally in your machine.
// TODO: fig: create a picture of levels of granularity
Due to the intrinsic nature of SWHIDs and the ability to refer to various parts of a software, we're also slowly unraveling the fact that the Software Heritage archive itself is essentially a gigantic wikipedia:Merkle_tree[Merkle tree] containing several kinds of objects.
Let's go back to the <<tbl:swhid-list, previous table of SWHIDs again>> and see what those are.
- A content object contains the content of a file.
- A directory object contains other directory objects and content objects.
- A revision object is a point in time of the development history of the project.
It also points to the root directory of the project.
- A release object is essentially the same as a revision object but with additional metadata.
In practice, this is typically the revision developers tagged for release (e.g., KDE Plasma 5.23, GNOME 42, Linux kernel 6.3).
- A snapshot object contains the whole source code including all visible branches at that point in time.
[#sidebar:similarities-with-git]
.Similarities with Git
****
[chat, foodogsquared]
====
Wait... this sounds similar to the link:https://git-scm.com/book/en/v2/Git-Internals-Plumbing-and-Porcelain[Git internals].
====
[chat, Ezran, role=reversed]
====
That's because it is using a similar data model as Git with the graph objects and even the object identifier being a hex-encoded SHA1 hash.
====
[chat, foodogsquared, state=curious]
====
Does this mean it is compatible with Git then?
====
[chat, Ezran, role=reversed]
====
Yes but it is more coincidental than anything.
This becomes especially clear once you notice the service supports importing software from version control software other than Git.
Just don't expect that to work every time.
====
****
== SWHID qualifiers
While the <<tbl:swhid-list, previously shown SWHIDs>> are enough and work as intended, the identifier alone lacks some information.
From the identifier system, one cannot easily infer certain information that we often need such as the URL of the repository and the path relative to the repository.
This is also reflected in the website interface if you've visited the links where it just strictly presents the software artifact (e.g., content, directory, revision).
- Let's take swh:{swh-nixpkgs-22-11-maintainers-dir}[] as an example where we just see a directory and nothing else.
- Or let's take the swh:{swh-nixpkgs-22-11}[] where we visit a revision of the project but we cannot see if it came from github:NixOS/nixpkgs[the canonical repository].
- With yet another example, let's take swh:{swh-bare-system76-firmware-license}[] where the exact content object can appear for GPLv3-licensed projects. footnote:[You cannot modify the GPLv3 document itself since it is a copyrighted document so any GPL-licensed projects should have the same license text thus the same object.]
Not to mention we can't tell where the license file is located in the repository, let alone the repository.
This is because of the data model of the archive being a gigantic Merkle tree where objects may be shared among multiple projects.
This makes certain tasks tedious, such as identifying whether the artifact belongs to a canonical repository or one of its many forks, which are also included in the archive.
Because of this, SWHIDs may also have a semicolon-delimited (`;`) list of qualifiers that adds contextual information.
.SWHID with qualifiers
image::./assets/swhid-with-qualifiers.svg[A SWHID with colorized parts, height=100%, width=100%, opts=inline]
Each qualifier means a different thing, as link:https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#qualifiers[documented nicely on its website].
Let's take the <<tbl:swhid-list, previous table>> and add more contextual information with it.
[#tbl:swhid-with-qualifiers]
.Previous SWHIDs with contextual information
[%header.break-anywhere,cols="2,1"]
|===
| SWHID
| Description
| swh:{swh-system76-firmware-license}[opts=full]
| Section 11 of the GPLv3 license from github:pop-os/system76-firmware[].
| swh:{swh-nixpkgs-22-11-maintainers-dir}[opts=full]
| `maintainers/` directory from the nixpkgs-22.11 branch from the canonical nixpkgs repository.
| swh:{swh-nixpkgs-22-11}[opts=full]
| A certain revision from the nixpkgs-22.11 branch from the canonical nixpkgs repository.
| swh:{swh-gnome-shell-3-38-6}[opts=full]
| The gitlab:GNOME/gnome-shell[canonical GNOME Shell v3.38.6 release, domain={gnome-gitlab}, rev=3.38.6].
| swh:{swh-gnome-shell-jan-4-2023}[opts=full]
| A snapshot of the gitlab:GNOME/gnome-shell[canonical gnome-shell repository, domain={gnome-gitlab}] captured on January 4th of 2023.
|===
If you click each of the links, the website interface is more complete compared to the <<tbl:swhid-list, previous table of SWHIDs>>.
[#fig:side-to-side-comparison-swhid-with-and-without-qualifiers]
.Side-to-side comparison of the website interface for `{swh-bare-system76-firmware-license}` with and without qualifiers
image::./assets/archive-website-swhid-cnt-side-to-side.png[Side-to-side comparison of the website interface in {swh-bare-system76-firmware-license} with and without qualifiers]
This practice of adding contextual information is recommended as link:https://www.softwareheritage.org/faq/#34_Which_type_of_SWHID_should_I_use_in_my_articledocumentation[documented from its FAQ].
More specifically, the contextual information has to be as complete as possible; you can easily get the identifier with all relevant qualifiers from the archive website interface, which we'll cover next.
You can see more of them from <<appendix:guidelines-for-referencing-swhids>>.
[NOTE]
====
While the link text in the table is shown as the complete identifier with all qualifiers, it is recommended to show only the core identifier as the link text.
This is to address the obvious problem of length making it harder to read.
For a proper example of a hyperlink, here is one with swh:{swh-system76-firmware-license}[].
====
[chat, foodogsquared, state=curious]
====
So what happens when I give a qualifier with a wrong value such as the `origin` qualifier that points to a non-existent origin or an `anchor` qualifier that points to an invalid SWHID?
====
[chat, Ezran, role=reversed]
====
Why don't you try those out yourself?
Here's a list of them just for starters.
[.break-anywhere]
- swh:{swhid-content-with-invalid-origin}[opts=full]
- swh:{swhid-content-with-invalid-path}[opts=full]
You could also mix and match qualifiers that are not supposed to appear in certain object types such as the `lines` qualifier in non-content objects.
====
[#using-the-software-heritage-archive-website]
== Using the Software Heritage archive website
Throughout the Software Heritage ecosystem, there are tools that make use of the service.
Its main interface, the link:https://archive.softwareheritage.org/[archive website], is what you're likely to use the most.
The workflow from the website interface is pretty simple: you search for the origin of the software, enter the corresponding object, and specify what you want to refer to. footnote:[Y'know, identifying and referring to parts of software, as this point has already been hammered in multiple times by now. :)]
The most important thing to note with this website is that it doubles as a resolver for SWHIDs, similar to the one used for link:https://dx.doi.org/[DOIs].
You've already seen its usage with the links from the previous tables such as in <<tbl:swhid-list>> and <<tbl:swhid-with-qualifiers>>.
Using it as a resolver is simple: just append the identifier on the root endpoint of the service.
[literal]
----
https://archive.softwareheritage.org/$SWHID
----
With the user-facing side of the website, what you'll see first is a search interface.
Take note that the quality of the search results is neither perfect nor usable if you're not aware of the quirks of its search engine.
For example, merely entering the name of the software is not typically enough for searching.
[#fig:website-search-result-linux-kernel-query]
.The search result for the query "linux kernel"
image::./assets/archive-website-interface-linux-kernel-query.png[The search result from the query "linux kernel" from the Software Heritage archive website]
Even searching with metadata doesn't help.
[#fig:website-search-result-linux-kernel-query-with-metadata]
.The search result for the query "linux kernel" with metadata
image::./assets/archive-website-interface-linux-kernel-query-with-metadata.png[The search result from the query "linux kernel" including its metadata in the Software Heritage archive website]
It's pretty obvious that it doesn't have enough quality results.
Instead, I recommend to enter the origin URL that you're searching for (e.g., `https://github.com/torvalds/linux`).
If there is an exact match of the given origin, the website will directly go to the page of the software artifact with that origin.
This is especially nice for the sources it already monitors such as GitHub, GitLab instances, and Gitea instances.
This even works for package indices such as PyPI and npm (e.g., `https://pypi.org/project/swh.core`, `https://www.npmjs.com/package/vue`).
For more details, there is link:https://archive.softwareheritage.org/coverage/[a dedicated page on what sources are being monitored] from which you can infer what URLs can be resolved in this way.
Once you get into the software artifact of your choosing (e.g., directory, file, revision, snapshot), you can get the identifier with the permalink tab on the side of the website.
[#vid:archive-website-using-permalink-tab]
.Using the permalink tab on the website
video::./assets/archive-website-using-permalink-tab.webm[]
[#other-software-heritage-tools]
== Other Software Heritage tools
Other than the link:https://archive.softwareheritage.org/[website], there are tools available to easily make use of the service.
The ecosystem of Software Heritage is somewhat limited as Software Heritage itself is relatively young but it does have nice tools to begin with.
Let's take a closer look at them.
- link:https://docs.softwareheritage.org/devel/swh-model/cli.html[`swh identify`] is a command-line interface that prints the SWHID of the given objects.
SWHIDs are computed from the objects themselves, so this can be done locally, which is nice if you have the codebase on disk and want to refer to it through the archive.
- A nice way to explore the archive is with link:https://docs.softwareheritage.org/devel/swh-fuse/index.html[Software Heritage Filesystem] (SwhFS) which comes with a command-line interface (`swh fs`).
This tool alongside `swh identify` is one way to explore the archive entirely on the terminal.
- link:https://docs.softwareheritage.org/devel/swh-web-client/[A web client for SWH in Python] which is nice if you're using Python in the first place.
- link:https://www.softwareheritage.org/browser-extensions/[Some SWH-related browser extensions.] footnote:[As of 2023-05-09, only one is made public so far.]
Among them is the link:https://www.softwareheritage.org/browser-extensions/#UpdateSWH[UpdateSWH] which checks and includes the archival of a repository in the queue, all in a simple interface.
- For those who are writing with LaTeX, ctan:biblatex-software[there is a package for adding software entry types in BibLaTeX].
Furthermore, there are initiatives to integrate it with projects such as with link:https://www.softwareheritage.org/2019/04/18/software-heritage-and-gnu-guix-join-forces-to-enable-long-term-reproducibility/[Guix] and link:https://nlnet.nl/project/SoftwareHeritage-P2P/[peer-to-peer access with IPFS].
[#sidebar:swh-tools-wishlist]
.SWH tools wishlist
****
As the ecosystem around Software Heritage is young, there are some tools and services that could use and integrate with the service.
The following list is what I would like to see.
- More integration with software forges.
Though this could be implemented with browser extensions, it would be nicer if forges such as GitHub and Gitea can integrate the service even if it is through extensions.
GitHub already has some foundations with this feature as link:https://github.blog/2021-08-19-enhanced-support-citations-github/[it has citation support].
- link:https://www.zotero.org/[Zotero] integration with the service.
You could go into the archive and quickly get the reference just as you would on link:https://arxiv.org[arXiv].
****
[#appendix:guidelines-for-referencing-swhids]
[appendix]
== Guidelines for referencing SWHIDs
While using SWHIDs is a done-and-forget procedure (for the most part), there is a set of guidelines to make usage of them a bit easier.
- Per the link:https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#choosing-what-type-of-swhid-to-use[documentation], it is recommended to use `swh:dir:` SWHIDs over `swh:rev:` or `swh:rel:` since `swh:dir:` can be computed without relying on the Software Heritage archive.
The revision and release identifiers are mostly used as part of the metadata, such as in the example from <<tbl:swhid-with-qualifiers>>.
- As already mentioned, SWHIDs with full contextual qualifiers are recommended.
This should be easy to retrieve considering the website interface gets them for you as seen from <<vid:archive-website-using-permalink-tab, this video>>.
- If you want to create a hyperlink, it is advisable to use the core identifier as the link text to address the obvious problem of length making it harder to read (case in point, <<tbl:swhid-with-qualifiers, in this table>>).
For a proper example of a hyperlink, here is one with swh:{swh-system76-firmware-license}[].
[#appendix:extending-asciidoctor-for-linking-swhids]
[appendix]
== Extending Asciidoctor for linking SWHIDs
Linking SWHIDs could be tedious when writing documents.
In link:https://asciidoctor.org[Asciidoctor], there are features that make this easier.
Specifically, we're talking about storing the identifiers in link:https://docs.asciidoctor.org/asciidoc/latest/attributes/document-attributes/[document attributes].
.Using attributes for storing and linking SWHIDs
[source, asciidoc]
----
:swh-system76-firmware-license-core-identifier: swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2
:swh-system76-firmware-license: swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2;origin=https://github.com/pop-os/system76-firmware;lines=471-538
link:https://archive.softwareheritage.org/{swh-system76-firmware-license}[{swh-system76-firmware-license-core-identifier}]
----
In my opinion, this is still tedious since we have to store two attributes that would need separate changes where it should be only one.
Fortunately, Asciidoctor can be extended to introduce new syntax, as xref:{asciidoc-go-template-hugo-featuring-nix-post}[I've previously shown].
We can apply a similar solution here.
[NOTE]
====
This is the very solution used for linking SWHIDs in github:foo-dogsquared/website[my website].
====
For our initial version of the new syntax, it looks like the following.
[source, asciidoc]
----
swh:$SWHID[$CAPTION]
----
It is an link:https://docs.asciidoctor.org/asciidoctor/latest/extensions/inline-macro-processor/[inline macro] that accepts an SWHID and can accept a caption as the link text.
Take note that the caption is optional, with the core identifier being the default caption.
The following listing should show a complete list of use cases we considered for this macro.
.`sample.adoc`
[source, asciidoc]
----
include::git:{doccontentref}[path=swhid-sample.adoc]
----
The inline macro should produce a link target to the default SWHID resolver at `https://archive.softwareheritage.org`.
Anyways, here's the code for the `swh` Asciidoctor extension.
.`lib/asciidoctor/swhid-inline-macro/extension.rb`
[source, ruby]
----
include::git:{doccontentref}[path=lib/asciidoctor/custom_extensions/swhid_link_inline_macro.rb]
----
[NOTE]
====
As an exercise, you could add an option to replace the resolver domain with the `resolver` attribute.
Furthermore, you could add a procedure to check whether the given SWHID is valid or not.
====
You cannot make use of the extension as it is not registered within the Asciidoctor registry yet.
Let's make the file that does that.
.`lib/asciidoctor-custom-extensions.rb`
[source, ruby]
----
include::git:{doccontentref}[path=lib/asciidoctor-custom-extensions.rb]
----
Now with the extension in place, you can use it with Asciidoctor as shown in the following listing.
[source, shell]
----
asciidoctor -r ./lib/asciidoctor-custom-extensions.rb sample.adoc
----
Voila!
Now you have a nicer way of linking them SWHIDs with the archive.
This extension should be usable for all backends since it is a simple shorthand for linking SWHIDs to the archive.