diff --git a/README.md b/README.md index 95dd8963..61ce69b2 100644 --- a/README.md +++ b/README.md @@ -81,9 +81,9 @@ osv-detector --parse-as 'package-lock.json' path/to/my/file.lock ``` By default, the detector attempts to detect known vulnerabilities by checking -the versions of packages specified by the parsed lockfile against the versions -specified by the OSVs in the loaded OSV databases, using an internal -semver-based package that aims to minimize false negatives (see +the versions of packages in the parsed lockfile against the OSVs in the loaded OSV databases, +comparing based on the version ordering rules for the specific ecosystem being +checked as closely as possible (see [this section](#version-parsing-and-comparing) for more details about version handling). @@ -335,7 +335,7 @@ the detector doesn't know about, such as `NuGet`. You can either pass in CSV rows: ``` -osv-detector --parse-as csv-row 'npm,@typescript-eslint/types,5.13.0' 'Packagist,sentry/sdk,2.0.4' +osv-detector --parse-as csv-row 'npm,,@typescript-eslint/types,5.13.0' 'Packagist,,sentry/sdk,2.0.4' ``` or you can specify paths to csv files: @@ -344,17 +344,35 @@ or you can specify paths to csv files: osv-detector --parse-as csv-file path/to/my/first-csv path/to/my/second-csv ``` -Each CSV row must have at least three fields which hold the ecosystem, package -name, and version (or commit) respectively, and CSV files cannot contain a -header. +Each CSV row represents a package and is made up of at least four fields: + +1. The ecosystem that the package is from, which is used as part of identifying + if an OSV is about the given package + - This does not have to be one of the ecosystems referenced in the detector, + or in the OSV specification + - This should be omitted if you are wanting to compare a commit using an API + database +2. 
The ecosystem whose version comparison semantics to use when determining if + an OSV applies to the given package + - This has to be an ecosystem for which the detector supports comparing + versions of; this field can be blank if the first field refers to an + ecosystem the detector supports comparing, otherwise it should be the + ecosystem whose version semantics most closely match that of your arbitrary + ecosystem + - This should be omitted if you are wanting to compare a commit using an API + database +3. The name of the package +4. The version of the package, or the SHA of a `git` commit + - If you are providing a commit, then you must leave the first two fields + empty and ensure an API-based database is loaded i.e. via `--use-api` + +> **Warning** +> +> Do not include a header if you are using a CSV file The `ecosystem` does _not_ have to be one listed by the detector as known, meaning you can use any ecosystem that [osv.dev](https://osv.dev/) provides. -If the ecosystem field is empty, then the `version` field is expected to be a -commit. In this case, the `package` column is decorative as only the commit is -passed to the API. - > Remember to tell the detector to use the `osv.dev` API via the `--use-api` > flag if you're wanting to check commits! @@ -362,7 +380,7 @@ You can also omit the version to have the detector list all known vulnerabilities in the loaded database that apply to the given package: ``` -osv-detector --parse-as csv-row 'NuGet,Yarp.ReverseProxy,' +osv-detector --parse-as csv-row 'NuGet,,Yarp.ReverseProxy,' ``` While this uses the `--parse-as` flag, these are _not_ considered standard @@ -417,8 +435,10 @@ The following packages were found in /path/to/my/Gemfile.lock: ## Version parsing and comparing -Versions are compared using an internal `semver` package which aims to support -any number of components followed by a build string. 
+Versions are compared using an internal `semantic` package which aims to +compare versions accurately per the version semantics of each ecosystem, falling +back to a relaxed version of SemVer that supports an unlimited number of components +followed by a build string. Components are numbers broken up by dots, e.g. `1.2.3` has the components `1, 2, 3`. Anything that is not a number or a dot is considered to be the start diff --git a/fixtures/csvs-files/two-rows.csv b/fixtures/csvs-files/two-rows.csv index ee0bb135..42c07bbf 100644 --- a/fixtures/csvs-files/two-rows.csv +++ b/fixtures/csvs-files/two-rows.csv @@ -1,2 +1,2 @@ -NuGet,Yarp.ReverseProxy, -npm,@typescript-eslint/types,5.13.0 +NuGet,,Yarp.ReverseProxy, +npm,,@typescript-eslint/types,5.13.0 diff --git a/generators/GenerateMavenVersions.java b/generators/GenerateMavenVersions.java new file mode 100644 index 00000000..7c8f6ba2 --- /dev/null +++ b/generators/GenerateMavenVersions.java @@ -0,0 +1,209 @@ +import org.apache.maven.artifact.versioning.ComparableVersion; + +import org.json.JSONArray; +import org.json.JSONObject; + +import java.io.*; +import java.net.URL; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +/** + * Script for generating a list of maven version comparison fixtures based off + * every version mentioned in the OSV Maven database, sorted using the native + * Maven implementation. + *
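+ * To run this, you need to ensure copies of the following libraries are present
+ * on the class path:
+ *
+ * <ul>
+ *   <li>json</li>
+ *   <li>maven-artifact</li>
+ * </ul>
+ *
+ * The easiest way to do this is by placing the jars in a <code>lib</code>
+ * subfolder and then running:
+ *
+ * <pre>
+ * java -cp generators/lib/* generators/GenerateMavenVersions.java
+ * </pre>
+ */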
+public class GenerateMavenVersions {
+ public static String downloadMavenDb() throws IOException {
+ URL website = new URL("https://osv-vulnerabilities.storage.googleapis.com/Maven/all.zip");
+ String file = "./maven-db.zip";
+
+ ReadableByteChannel rbc = Channels.newChannel(website.openStream());
+
+ try(FileOutputStream fos = new FileOutputStream(file)) {
+ fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
+ }
+
+ return file;
+ }
+
+ public static Map[-_\.]?(?P(a|b|c|rc|alpha|beta|pre|preview))[-_\.]?(?P [0-9]+)?)?(?P (?:-(?P [0-9]+))|(?:[-_\.]?(?P post|rev|r)[-_\.]?(?P [0-9]+)?))?(?P [-_\.]?(?P dev)[-_\.]?(?P [0-9]+)?)?)(?:\+(?P [a-z0-9]+(?:[-_\.][a-z0-9]+)*))?\s*$`) + match := re.FindStringSubmatch(str) + + if len(match) == 0 { + return parsePyPILegacyVersion(str) + } + + var version PyPIVersion + + version.epoch = big.NewInt(0) + + if epoch := match[re.SubexpIndex("epoch")]; epoch != "" { + version.epoch = convertToBigIntOrPanic(epoch) + } + + for _, r := range strings.Split(match[re.SubexpIndex("release")], ".") { + version.release = append(version.release, convertToBigIntOrPanic(r)) + } + + version.pre = parseLetterVersion(match[re.SubexpIndex("pre_l")], match[re.SubexpIndex("pre_n")]) + + post := match[re.SubexpIndex("post_n1")] + + if post == "" { + post = match[re.SubexpIndex("post_n2")] + } + + version.post = parseLetterVersion(match[re.SubexpIndex("post_l")], post) + version.dev = parseLetterVersion(match[re.SubexpIndex("dev_l")], match[re.SubexpIndex("dev_n")]) + version.local = parseLocalVersion(match[re.SubexpIndex("local")]) + + return version +} + +// Compares the epoch segments of each version +func (pv PyPIVersion) compareEpoch(pw PyPIVersion) int { + return pv.epoch.Cmp(pw.epoch) +} + +// Compares the release segments of each version, which considers the numeric value +// of each component in turn; when comparing release segments with different numbers +// of components, the shorter segment is padded out with additional zeros as necessary. +func (pv PyPIVersion) compareRelease(pw PyPIVersion) int { + return pv.release.Cmp(pw.release) +} + +func (pv PyPIVersion) preIndex() int { + for i, pre := range []string{"a", "b", "rc"} { + if pre == pv.pre.letter { + return i + } + } + + panic(fmt.Sprintf("unknown prefix %s", pv.pre.letter)) +} + +// Checks if this PyPIVersion should apply a sort trick when comparing pre, +// which ensures that i.e. 1.0.dev0 is before 1.0a0. 
+func (pv PyPIVersion) shouldApplyPreTrick() bool { + return pv.pre.number == nil && pv.post.number == nil && pv.dev.number != nil +} + +// Compares the pre-release segment of each version, which consist of an alphabetical +// identifier for the pre-release phase, along with a non-negative integer value. +// +// Pre-releases for a given release are ordered first by phase (alpha, beta, release +// candidate) and then by the numerical component within that phase. +// +// Versions without a pre-release are sorted after those with one. +func (pv PyPIVersion) comparePre(pw PyPIVersion) int { + switch { + case pv.shouldApplyPreTrick() && pw.shouldApplyPreTrick(): + return +0 + case pv.shouldApplyPreTrick(): + return -1 + case pw.shouldApplyPreTrick(): + return +1 + case pv.pre.number == nil && pw.pre.number == nil: + return +0 + case pv.pre.number == nil: + return +1 + case pw.pre.number == nil: + return -1 + default: + ai := pv.preIndex() + bi := pw.preIndex() + + if ai == bi { + return pv.pre.number.Cmp(pw.pre.number) + } + + if ai > bi { + return +1 + } + if ai < bi { + return -1 + } + + return 0 + } +} + +// Compares the post-release segment of each version. +// +// Post-releases are ordered by their numerical component, immediately following +// the corresponding release, and ahead of any subsequent release. +// +// Versions without a post segment are sorted before those with one. +func (pv PyPIVersion) comparePost(pw PyPIVersion) int { + switch { + case pv.post.number == nil && pw.post.number == nil: + return +0 + case pv.post.number == nil: + return -1 + case pw.post.number == nil: + return +1 + default: + return pv.post.number.Cmp(pw.post.number) + } +} + +// Compares the dev-release segment of each version, which consists of the string +// ".dev" followed by a non-negative integer value. 
+// +// Developmental releases are ordered by their numerical component, immediately +// before the corresponding release (and before any pre-releases with the same release segment), +// and following any previous release (including any post-releases). +// +// Versions without a development segment are sorted after those with one. +func (pv PyPIVersion) compareDev(pw PyPIVersion) int { + switch { + case pv.dev.number == nil && pw.dev.number == nil: + return +0 + case pv.dev.number == nil: + return +1 + case pw.dev.number == nil: + return -1 + default: + return pv.dev.number.Cmp(pw.dev.number) + } +} + +// Compares the local segment of each version +func (pv PyPIVersion) compareLocal(pw PyPIVersion) int { + min := minInt(len(pv.local), len(pw.local)) + + var compare int + + for i := 0; i < min; i++ { + ai, aIsNumber := convertToBigInt(pv.local[i]) + bi, bIsNumber := convertToBigInt(pw.local[i]) + + switch { + // If a segment consists entirely of ASCII digits then that section should be considered an integer for comparison purposes + case aIsNumber && bIsNumber: + compare = ai.Cmp(bi) + // If a segment contains any ASCII letters then that segment is compared lexicographically with case insensitivity. + case !aIsNumber && !bIsNumber: + compare = strings.Compare(pv.local[i], pw.local[i]) + // When comparing a numeric and lexicographic segment, the numeric section always compares as greater than the lexicographic segment. + case aIsNumber: + compare = +1 + default: + compare = -1 + } + + if compare != 0 { + if compare > 0 { + return 1 + } + + return -1 + } + } + + // Additionally a local version with a great number of segments will always compare as greater than a local version with fewer segments, + // as long as the shorter local version’s segments match the beginning of the longer local version’s segments exactly. 
+ if len(pv.local) > len(pw.local) { + return +1 + } + if len(pv.local) < len(pw.local) { + return -1 + } + + return 0 +} + +// Compares the legacy segment of each version. +// +// These are versions that predate and are incompatible with PEP 440 - comparing +// is "best effort" since there isn't a strong specification defined, and are +// always considered lower than PEP 440 versions to match current day tooling. +// +// http://peak.telecommunity.com/DevCenter/setuptools#specifying-your-project-s-version +// looks like a good reference, but unsure where it sits in the actual tooling history +func (pv PyPIVersion) compareLegacy(pw PyPIVersion) int { + if len(pv.legacy) == 0 && len(pw.legacy) == 0 { + return +0 + } + if len(pv.legacy) == 0 && len(pw.legacy) != 0 { + return +1 + } + if len(pv.legacy) != 0 && len(pw.legacy) == 0 { + return -1 + } + + return strings.Compare( + strings.Join(pv.legacy, ""), + strings.Join(pw.legacy, ""), + ) +} + +func pypiCompareVersion(v, w PyPIVersion) int { + if legacyDiff := v.compareLegacy(w); legacyDiff != 0 { + return legacyDiff + } + if epochDiff := v.compareEpoch(w); epochDiff != 0 { + return epochDiff + } + if releaseDiff := v.compareRelease(w); releaseDiff != 0 { + return releaseDiff + } + if preDiff := v.comparePre(w); preDiff != 0 { + return preDiff + } + if postDiff := v.comparePost(w); postDiff != 0 { + return postDiff + } + if devDiff := v.compareDev(w); devDiff != 0 { + return devDiff + } + if localDiff := v.compareLocal(w); localDiff != 0 { + return localDiff + } + + return 0 +} + +func (pv PyPIVersion) Compare(pw PyPIVersion) int { + return pypiCompareVersion(pv, pw) +} + +func (pv PyPIVersion) CompareStr(str string) int { + return pv.Compare(parsePyPIVersion(str)) +} diff --git a/pkg/semantic/version-rubygems.go b/pkg/semantic/version-rubygems.go new file mode 100644 index 00000000..66c6e87f --- /dev/null +++ b/pkg/semantic/version-rubygems.go @@ -0,0 +1,148 @@ +package semantic + +import ( + "strconv" + "strings" +) 
+ +func canonicalizeRubyGemVersion(str string) string { + res := "" + + checkPrevious := false + previousWasDigit := true + + for _, c := range str { + if c == 46 { + checkPrevious = false + res += "." + + continue + } + + isDigit := c >= 48 && c <= 57 + + if checkPrevious && previousWasDigit != isDigit { + res += "." + } + + res += string(c) + + previousWasDigit = isDigit + checkPrevious = true + } + + return res +} + +func groupSegments(segs []string) (numbers []string, build []string) { + for _, seg := range segs { + _, isNumber := convertToBigInt(seg) + + if len(build) > 0 || !isNumber { + build = append(build, seg) + + continue + } + + numbers = append(numbers, seg) + } + + return numbers, build +} + +func removeZeros(segs []string) []string { + i := len(segs) - 1 + + for i >= 0 { + if segs[i] != "0" { + i++ + + break + } + + i-- + } + + return segs[:maxInt(i, 0)] +} + +func canonicalSegments(segs []string) (canSegs []string) { + numbers, build := groupSegments(segs) + + return append(removeZeros(numbers), removeZeros(build)...) 
+} + +func compareRubyGemsComponents(a, b []string) int { + max := maxInt(len(a), len(b)) + + var compare int + + for i := 0; i < max; i++ { + as := fetch(a, i, "0") + bs := fetch(b, i, "0") + + ai, aIsNumber := convertToBigInt(as) + bi, bIsNumber := convertToBigInt(bs) + + switch { + case aIsNumber && bIsNumber: + compare = ai.Cmp(bi) + case !aIsNumber && !bIsNumber: + compare = strings.Compare(as, bs) + case aIsNumber: + compare = +1 + default: + compare = -1 + } + + if compare != 0 { + if compare > 0 { + return 1 + } + + return -1 + } + } + + if len(a) > len(b) { + next := a[len(b)] + + if _, err := strconv.Atoi(next); err == nil { + return 1 + } + + return -1 + } + + if len(a) < len(b) { + next := b[len(a)] + + if _, err := strconv.Atoi(next); err == nil { + return -1 + } + + return +1 + } + + return 0 +} + +type RubyGemsVersion struct { + Original string + Segments []string +} + +func parseRubyGemsVersion(str string) RubyGemsVersion { + return RubyGemsVersion{ + str, + canonicalSegments(strings.Split(canonicalizeRubyGemVersion(str), ".")), + } +} + +func (v RubyGemsVersion) Compare(w RubyGemsVersion) int { + return compareRubyGemsComponents(v.Segments, w.Segments) +} + +func (v RubyGemsVersion) CompareStr(str string) int { + return v.Compare(parseRubyGemsVersion(str)) +} diff --git a/pkg/semantic/version-semver-like.go b/pkg/semantic/version-semver-like.go new file mode 100644 index 00000000..d9480192 --- /dev/null +++ b/pkg/semantic/version-semver-like.go @@ -0,0 +1,141 @@ +package semantic + +import ( + "fmt" + "math/big" + "regexp" + "strings" +) + +// SemverLikeVersion is a version that is _like_ a version as defined by the +// Semantic Version specification, except with potentially unlimited numeric +// components and a leading "v" +type SemverLikeVersion struct { + LeadingV bool + Components Components + Build string + Original string +} + +func (v *SemverLikeVersion) fetchComponentsAndBuild(maxComponents int) (Components, string) { + if 
len(v.Components) <= maxComponents { + return v.Components, v.Build + } + + comps := v.Components[:maxComponents] + extra := v.Components[maxComponents:] + + build := v.Build + + for _, c := range extra { + build += fmt.Sprintf(".%d", c) + } + + return comps, build +} + +func ParseSemverLikeVersion(line string, maxComponents int) SemverLikeVersion { + v := parseSemverLike(line) + + if maxComponents == -1 { + return v + } + + components, build := v.fetchComponentsAndBuild(maxComponents) + + return SemverLikeVersion{ + LeadingV: v.LeadingV, + Components: components, + Build: build, + Original: v.Original, + } +} + +func parseSemverLike(line string) SemverLikeVersion { + var components []*big.Int + originStr := line + + numberReg := regexp.MustCompile(`\d`) + + currentCom := "" + foundBuild := false + emptyComponent := false + + leadingV := strings.HasPrefix(line, "v") + line = strings.TrimPrefix(line, "v") + + for _, c := range line { + if foundBuild { + currentCom += string(c) + + continue + } + + // this is part of a component version + if numberReg.MatchString(string(c)) { + currentCom += string(c) + + continue + } + + // at this point, we: + // 1. might be parsing a component (as foundBuild != true) + // 2. we're not looking at a part of a component (as c != number) + // + // so c must be either: + // 1. a component terminator (.), or + // 2. the start of the build string + // + // either way, we will be terminating the current component being + // parsed (if any), so let's do that first + if currentCom != "" { + v, _ := new(big.Int).SetString(currentCom, 10) + + components = append(components, v) + currentCom = "" + + emptyComponent = false + } + + // a component terminator means there might be another component + // afterwards, so don't start parsing the build string just yet + if c == '.' 
{ + emptyComponent = true + + continue + } + + // anything else is part of the build string + foundBuild = true + currentCom = string(c) + } + + // if we looped over everything without finding a build string, + // then what we were currently parsing is actually a component + if !foundBuild && currentCom != "" { + v, _ := new(big.Int).SetString(currentCom, 10) + + components = append(components, v) + currentCom = "" + emptyComponent = false + } + + // if we ended with an empty component section, + // prefix the build string with a '.' + if emptyComponent { + currentCom = "." + currentCom + } + + // if we found no components, then the v wasn't actually leading + if len(components) == 0 && leadingV { + leadingV = false + currentCom = "v" + currentCom + } + + return SemverLikeVersion{ + LeadingV: leadingV, + Components: components, + Build: currentCom, + Original: originStr, + } +} diff --git a/pkg/semantic/version-semver.go b/pkg/semantic/version-semver.go new file mode 100644 index 00000000..bd198405 --- /dev/null +++ b/pkg/semantic/version-semver.go @@ -0,0 +1,103 @@ +package semantic + +import ( + "strings" +) + +// Removes build metadata from the given string if present, per semver v2 +// +// See https://semver.org/spec/v2.0.0.html#spec-item-10 +func removeBuildMetadata(str string) string { + parts := strings.Split(str, "+") + + return parts[0] +} + +func compareBuildComponents(a, b string) int { + // https://semver.org/spec/v2.0.0.html#spec-item-10 + a = removeBuildMetadata(a) + b = removeBuildMetadata(b) + + // the spec doesn't explicitly say "don't include the hyphen in the compare" + // but it's what node-semver does so for now let's go with that... 
+ a = strings.TrimPrefix(a, "-") + b = strings.TrimPrefix(b, "-") + + // versions with a prerelease are considered less than those without + // https://semver.org/spec/v2.0.0.html#spec-item-9 + if a == "" && b != "" { + return +1 + } + if a != "" && b == "" { + return -1 + } + + return compareSemverBuildComponents( + strings.Split(a, "."), + strings.Split(b, "."), + ) +} + +func compareSemverBuildComponents(a, b []string) int { + min := minInt(len(a), len(b)) + + var compare int + + for i := 0; i < min; i++ { + ai, aIsNumber := convertToBigInt(a[i]) + bi, bIsNumber := convertToBigInt(b[i]) + + switch { + // 1. Identifiers consisting of only digits are compared numerically. + case aIsNumber && bIsNumber: + compare = ai.Cmp(bi) + // 2. Identifiers with letters or hyphens are compared lexically in ASCII sort order. + case !aIsNumber && !bIsNumber: + compare = strings.Compare(a[i], b[i]) + // 3. Numeric identifiers always have lower precedence than non-numeric identifiers. + case aIsNumber: + compare = -1 + default: + compare = +1 + } + + if compare != 0 { + if compare > 0 { + return 1 + } + + return -1 + } + } + + // 4. A larger set of pre-release fields has a higher precedence than a smaller set, + // if all the preceding identifiers are equal. 
+ if len(a) > len(b) { + return +1 + } + if len(a) < len(b) { + return -1 + } + + return 0 +} + +type SemverVersion struct { + SemverLikeVersion +} + +func parseSemverVersion(str string) SemverVersion { + return SemverVersion{ParseSemverLikeVersion(str, 3)} +} + +func (v SemverVersion) Compare(w SemverVersion) int { + if diff := v.Components.Cmp(w.Components); diff != 0 { + return diff + } + + return compareBuildComponents(v.Build, w.Build) +} + +func (v SemverVersion) CompareStr(str string) int { + return v.Compare(parseSemverVersion(str)) +} diff --git a/pkg/semantic/version.go b/pkg/semantic/version.go index f25dba67..24dcdf86 100644 --- a/pkg/semantic/version.go +++ b/pkg/semantic/version.go @@ -1,19 +1,19 @@ package semantic import ( - "fmt" "math/big" - "strings" ) -type Components []*big.Int - -type Version struct { - LeadingV bool - Components Components - Build string +type Version interface { + // CompareStr returns an integer representing the sort order of the given string + // when parsed as the concrete Version relative to the subject Version. + // + // The result will be 0 if v == w, -1 if v < w, or +1 if v > w. + CompareStr(str string) int } +type Components []*big.Int + func (components *Components) Fetch(n int) *big.Int { if len(*components) <= n { return big.NewInt(0) @@ -22,19 +22,16 @@ func (components *Components) Fetch(n int) *big.Int { return (*components)[n] } -func (v *Version) String() string { - str := "" +func (components *Components) Cmp(b Components) int { + numberOfComponents := maxInt(len(*components), len(b)) - if v.LeadingV { - str += "v" - } + for i := 0; i < numberOfComponents; i++ { + diff := components.Fetch(i).Cmp(b.Fetch(i)) - for _, component := range v.Components { - str += fmt.Sprintf("%d.", component) + if diff != 0 { + return diff + } } - str = strings.TrimSuffix(str, ".") - str += v.Build - - return str + return 0 }