diff --git a/.github/actions/veristat_baseline_compare/action.yml b/.github/actions/veristat_baseline_compare/action.yml new file mode 100644 index 0000000000000..9e25a2b0bf1ca --- /dev/null +++ b/.github/actions/veristat_baseline_compare/action.yml @@ -0,0 +1,49 @@ +name: 'run-veristat' +description: 'Run veristat benchmark' +inputs: + veristat_output: + description: 'Veristat output filepath' + required: true + baseline_name: + description: 'Veristat baseline cache name' + required: true +runs: + using: "composite" + steps: + - uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.baseline_name }} + if-no-files-found: error + path: ${{ github.workspace }}/${{ inputs.veristat_output }} + + # For pull request: + # - get baseline log from cache + # - compare it to current run + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/cache/restore@v4 + with: + key: ${{ inputs.baseline_name }} + restore-keys: | + ${{ inputs.baseline_name }}- + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' + + - if: ${{ github.event_name == 'pull_request' }} + name: Show veristat comparison + shell: bash + run: ./.github/scripts/compare-veristat-results.sh + env: + BASELINE_PATH: ${{ github.workspace }}/${{ inputs.baseline_name }} + VERISTAT_OUTPUT: ${{ inputs.veristat_output }} + + # For push: just put baseline log to cache + - if: ${{ github.event_name == 'push' }} + shell: bash + run: | + mv "${{ github.workspace }}/${{ inputs.veristat_output }}" \ + "${{ github.workspace }}/${{ inputs.baseline_name }}" + + - if: ${{ github.event_name == 'push' }} + uses: actions/cache/save@v4 + with: + key: ${{ inputs.baseline_name }}-${{ github.run_id }} + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' diff --git a/.github/scripts/compare-veristat-results.sh b/.github/scripts/compare-veristat-results.sh new file mode 100755 index 0000000000000..f95c3c192d80d --- /dev/null +++ b/.github/scripts/compare-veristat-results.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ ! -f "${BASELINE_PATH}" ]]; then + echo "# No ${BASELINE_PATH} available" >> "${GITHUB_STEP_SUMMARY}" + + echo "No ${BASELINE_PATH} available" + echo "Printing veristat results" + cat "${VERISTAT_OUTPUT}" + + exit +fi + +selftests/bpf/veristat \ + --output-format csv \ + --emit file,prog,verdict,states \ + --compare "${BASELINE_PATH}" "${VERISTAT_OUTPUT}" > compare.csv + +python3 ./.github/scripts/veristat_compare.py compare.csv diff --git a/.github/scripts/download-gcc-bpf.sh b/.github/scripts/download-gcc-bpf.sh new file mode 100755 index 0000000000000..894584a01b2ec --- /dev/null +++ b/.github/scripts/download-gcc-bpf.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +GCC_BPF_RELEASE_GH_REPO=$1 +INSTALL_DIR=$(realpath $2) + +cd /tmp + +tag=$(gh release list -L 1 -R ${GCC_BPF_RELEASE_GH_REPO} --json tagName -q .[].tagName) +if [[ -z "$tag" ]]; then + echo "Could not find latest GCC BPF release at ${GCC_BPF_RELEASE_GH_REPO}" + exit 1 +fi + +url="https://github.com/${GCC_BPF_RELEASE_GH_REPO}/releases/download/${tag}/${tag}.tar.zst" +echo "Downloading $url" +wget -q "$url" + +tarball=${tag}.tar.zst +dir=$(tar tf $tarball | head -1 || true) + +echo "Extracting $tarball ..." +tar -I zstd -xf $tarball && rm -f $tarball + +rm -rf $INSTALL_DIR +mv -v $dir $INSTALL_DIR + +cd - + diff --git a/.github/scripts/matrix.py b/.github/scripts/matrix.py new file mode 100644 index 0000000000000..9f4f5bbe23a15 --- /dev/null +++ b/.github/scripts/matrix.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 + +import os +import dataclasses +import json + +from enum import Enum +from typing import Any, Dict, List, Final, Set, Union + +MANAGED_OWNER: Final[str] = "kernel-patches" +MANAGED_REPOS: Final[Set[str]] = { + f"{MANAGED_OWNER}/bpf", + f"{MANAGED_OWNER}/vmtest", +} + +DEFAULT_SELF_HOSTED_RUNNER_TAGS: Final[List[str]] = ["self-hosted", "docker-noble-main"] +DEFAULT_RUNNER: Final[str] = "ubuntu-24.04" +DEFAULT_LLVM_VERSION: Final[int] = 17 + + +class Arch(str, Enum): + """ + CPU architecture supported by CI. + """ + + AARCH64 = "aarch64" + S390X = "s390x" + X86_64 = "x86_64" + + +class Compiler(str, Enum): + GCC = "gcc" + LLVM = "llvm" + + +@dataclasses.dataclass +class Toolchain: + compiler: Compiler + # This is relevant ONLY for LLVM and should not be required for GCC + version: int + + @property + def short_name(self) -> str: + return str(self.compiler.value) + + @property + def full_name(self) -> str: + if self.compiler == Compiler.GCC: + return self.short_name + + return f"{self.short_name}-{self.version}" + + def to_dict(self) -> Dict[str, Union[str, int]]: + return { + "name": self.short_name, + "fullname": self.full_name, + "version": self.version, + } + + +@dataclasses.dataclass +class BuildConfig: + arch: Arch + toolchain: Toolchain + kernel: str = "LATEST" + run_veristat: bool = False + parallel_tests: bool = False + build_release: bool = False + + @property + def runs_on(self) -> List[str]: + if is_managed_repo(): + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [self.arch.value] + else: + return [DEFAULT_RUNNER] + + @property + def build_runs_on(self) -> List[str]: + if is_managed_repo(): + return ["codebuild"] + else: + return [DEFAULT_RUNNER] + + @property + def tests(self) -> Dict[str, Any]: + tests_list = [ + "test_progs", + "test_progs_parallel", + "test_progs_no_alu32", + "test_progs_no_alu32_parallel", + "test_verifier", + ] + + if self.arch.value != "s390x": + tests_list.append("test_maps") + + if self.toolchain.version >= 18: + tests_list.append("test_progs_cpuv4") + + # if self.arch in [Arch.X86_64, Arch.AARCH64]: + # tests_list.append("sched_ext") + + # Don't run GCC BPF runner, because too many tests are failing + # See: https://lore.kernel.org/bpf/87bjw6qpje.fsf@oracle.com/ + # if self.arch == Arch.X86_64: + # tests_list.append("test_progs-bpf_gcc") + + if not self.parallel_tests: + tests_list = [test for test in tests_list if not test.endswith("parallel")] + + return {"include": [generate_test_config(test) for test in tests_list]} + + def to_dict(self) -> Dict[str, Any]: + return { + "arch": self.arch.value, + "toolchain": self.toolchain.to_dict(), + "kernel": self.kernel, + "run_veristat": self.run_veristat, + "parallel_tests": self.parallel_tests, + "build_release": self.build_release, + "runs_on": self.runs_on, + "tests": self.tests, + "build_runs_on": self.build_runs_on, + } + + +def is_managed_repo() -> bool: + return ( + os.environ["GITHUB_REPOSITORY_OWNER"] == MANAGED_OWNER + and os.environ["GITHUB_REPOSITORY"] in MANAGED_REPOS + ) + + +def set_output(name, value): + """Write an output variable to the GitHub output file.""" + with open(os.getenv("GITHUB_OUTPUT"), "a", encoding="utf-8") as file: + file.write(f"{name}={value}\n") + + +def generate_test_config(test: str) -> Dict[str, Union[str, int]]: + """Create the configuration for the provided test.""" + is_parallel = test.endswith("_parallel") + config = { + "test": test, + "continue_on_error": is_parallel, + # While in experimental mode, parallel jobs may get stuck + # anywhere, including in user space where the kernel won't detect + # a problem and panic. We add a second layer of (smaller) timeouts + # here such that if we get stuck in a parallel run, we hit this + # timeout and fail without affecting the overall job success (as + # would be the case if we hit the job-wide timeout). For + # non-experimental jobs, 360 is the default which will be + # superseded by the overall workflow timeout (but we need to + # specify something). + "timeout_minutes": 30 if is_parallel else 360, + } + return config + + +if __name__ == "__main__": + matrix = [ + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + run_veristat=True, + parallel_tests=True, + ), + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.LLVM, version=DEFAULT_LLVM_VERSION), + build_release=True, + ), + BuildConfig( + arch=Arch.X86_64, + toolchain=Toolchain(compiler=Compiler.LLVM, version=18), + build_release=True, + ), + BuildConfig( + arch=Arch.AARCH64, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + ), + BuildConfig( + arch=Arch.S390X, + toolchain=Toolchain(compiler=Compiler.GCC, version=DEFAULT_LLVM_VERSION), + ), + ] + + # Outside of managed repositories only run on x86_64 + if not is_managed_repo(): + matrix = [config for config in matrix if config.arch == Arch.X86_64] + + json_matrix = json.dumps({"include": [config.to_dict() for config in matrix]}) + print(json.dumps(json.loads(json_matrix), indent=4)) + set_output("build_matrix", json_matrix) diff --git a/.github/scripts/tests/test_veristat_compare.py b/.github/scripts/tests/test_veristat_compare.py new file mode 100644 index 0000000000000..b65b69295235d --- /dev/null +++ b/.github/scripts/tests/test_veristat_compare.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import unittest +from typing import Iterable, List + +from ..veristat_compare import parse_table, VeristatFields + + +def gen_csv_table(records: Iterable[str]) -> List[str]: + return [ + ",".join(VeristatFields.headers()), + *records, + ] + + +class TestVeristatCompare(unittest.TestCase): + def test_parse_table_ignore_new_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,N/A,success,N/A,N/A,1,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_ignore_removed_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,N/A,N/A,1,N/A,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_new_failure(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,failure,MISMATCH,1,1,+0 (+0.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [["prog_file.bpf.o", "prog_name", "success -> failure (!!)", "+0.00 %"]], + ) + self.assertTrue(veristat_info.changes) + self.assertTrue(veristat_info.new_failures) + + def test_parse_table_new_changes(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,failure,success,MISMATCH,0,0,+0 (+0.00%)", + "prog_file.bpf.o,prog_name_increase,failure,failure,MATCH,1,2,+1 (+100.00%)", + "prog_file.bpf.o,prog_name_decrease,success,success,MATCH,1,1,-1 (-100.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [ + ["prog_file.bpf.o", "prog_name", "failure -> success", "+0.00 %"], + ["prog_file.bpf.o", "prog_name_increase", "failure", "+100.00 %"], + ["prog_file.bpf.o", "prog_name_decrease", "success", "-100.00 %"], + ], + ) + self.assertTrue(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/scripts/tmpfsify-workspace.sh b/.github/scripts/tmpfsify-workspace.sh new file mode 100755 index 0000000000000..6fd62b4ad2a49 --- /dev/null +++ b/.github/scripts/tmpfsify-workspace.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -x -euo pipefail + +TMPFS_SIZE=20 # GB +MEM_TOTAL=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo) + +# sanity check: total mem is at least double TMPFS_SIZE +if [ $MEM_TOTAL -lt $(($TMPFS_SIZE*1024*2)) ]; then + echo "tmpfsify-workspace.sh: will not allocate tmpfs, total memory is too low (${MEM_TOTAL}MB)" + exit 0 +fi + +dir="$(basename "$GITHUB_WORKSPACE")" +cd "$(dirname "$GITHUB_WORKSPACE")" +mv "${dir}" "${dir}.backup" +mkdir "${dir}" +sudo mount -t tmpfs -o size=${TMPFS_SIZE}G tmpfs "${dir}" +rsync -a "${dir}.backup/" "${dir}" +cd - + diff --git a/.github/scripts/veristat_compare.py b/.github/scripts/veristat_compare.py new file mode 100644 index 0000000000000..07271b8cbd3aa --- /dev/null +++ b/.github/scripts/veristat_compare.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +# This script reads a CSV file produced by the following invocation: +# +# veristat --emit file,prog,verdict,states \ +# --output-format csv \ +# --compare ... +# +# And produces a markdown summary for the file. +# The summary is printed to standard output and appended to a file +# pointed to by GITHUB_STEP_SUMMARY variable. +# +# Script exits with return code 1 if there are new failures in the +# veristat results. +# +# For testing purposes invoke as follows: +# +# GITHUB_STEP_SUMMARY=/dev/null python3 veristat-compare.py test.csv +# +# File format (columns): +# 0. file_name +# 1. prog_name +# 2. verdict_base +# 3. verdict_comp +# 4. verdict_diff +# 5. total_states_base +# 6. total_states_comp +# 7. total_states_diff +# +# Records sample: +# file-a,a,success,failure,MISMATCH,12,12,+0 (+0.00%) +# file-b,b,success,success,MATCH,67,67,+0 (+0.00%) +# +# For better readability suffixes '_OLD' and '_NEW' +# are used instead of '_base' and '_comp' for variable +# names etc. + +import io +import os +import sys +import re +import csv +import logging +import argparse +import enum +from dataclasses import dataclass +from typing import Dict, Iterable, List, Final + + +TRESHOLD_PCT: Final[int] = 0 + +SUMMARY_HEADERS = ["File", "Program", "Verdict", "States Diff (%)"] + +# expected format: +0 (+0.00%) / -0 (-0.00%) +TOTAL_STATES_DIFF_REGEX = ( + r"(?P[+-]\d+) \((?P[+-]\d+\.\d+)\%\)" +) + + +TEXT_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +{table} +""".strip() +) + +HTML_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +
+Click to expand + +{table} +
+""".strip() +) + +GITHUB_MARKUP_REPLACEMENTS: Final[Dict[str, str]] = { + "->": "→", + "(!!)": ":bangbang:", +} + +NEW_FAILURE_SUFFIX: Final[str] = "(!!)" + + +class VeristatFields(str, enum.Enum): + FILE_NAME = "file_name" + PROG_NAME = "prog_name" + VERDICT_OLD = "verdict_base" + VERDICT_NEW = "verdict_comp" + VERDICT_DIFF = "verdict_diff" + TOTAL_STATES_OLD = "total_states_base" + TOTAL_STATES_NEW = "total_states_comp" + TOTAL_STATES_DIFF = "total_states_diff" + + @classmethod + def headers(cls) -> List[str]: + return [ + cls.FILE_NAME, + cls.PROG_NAME, + cls.VERDICT_OLD, + cls.VERDICT_NEW, + cls.VERDICT_DIFF, + cls.TOTAL_STATES_OLD, + cls.TOTAL_STATES_NEW, + cls.TOTAL_STATES_DIFF, + ] + + +@dataclass +class VeristatInfo: + table: list + changes: bool + new_failures: bool + + def get_results_title(self) -> str: + if self.new_failures: + return "There are new veristat failures" + + if self.changes: + return "There are changes in verification performance" + + return "No changes in verification performance" + + def get_results_summary(self, markup: bool = False) -> str: + title = self.get_results_title() + if not self.table: + return f"# {title}\n" + + template = TEXT_SUMMARY_TEMPLATE + table = format_table(headers=SUMMARY_HEADERS, rows=self.table) + + if markup: + template = HTML_SUMMARY_TEMPLATE + table = github_markup_decorate(table) + + return template.format(title=title, table=table) + + +def get_state_diff(value: str) -> float: + if value == "N/A": + return 0.0 + + matches = re.match(TOTAL_STATES_DIFF_REGEX, value) + if not matches: + raise ValueError(f"Failed to parse total states diff field value '{value}'") + + if percentage_diff := matches.group("percentage_diff"): + return float(percentage_diff) + + raise ValueError(f"Invalid {VeristatFields.TOTAL_STATES_DIFF} field value: {value}") + + +def parse_table(csv_file: Iterable[str]) -> VeristatInfo: + reader = csv.DictReader(csv_file) + assert reader.fieldnames == VeristatFields.headers() + + new_failures = False + changes = False + table = [] + + for record in reader: + add = False + + verdict_old, verdict_new = ( + record[VeristatFields.VERDICT_OLD], + record[VeristatFields.VERDICT_NEW], + ) + + # Ignore results from completely new and removed programs + if "N/A" in [verdict_new, verdict_old]: + continue + + if record[VeristatFields.VERDICT_DIFF] == "MISMATCH": + changes = True + add = True + verdict = f"{verdict_old} -> {verdict_new}" + if verdict_new == "failure": + new_failures = True + verdict += f" {NEW_FAILURE_SUFFIX}" + else: + verdict = record[VeristatFields.VERDICT_NEW] + + diff = get_state_diff(record[VeristatFields.TOTAL_STATES_DIFF]) + if abs(diff) > TRESHOLD_PCT: + changes = True + add = True + + if not add: + continue + + table.append( + [ + record[VeristatFields.FILE_NAME], + record[VeristatFields.PROG_NAME], + verdict, + f"{diff:+.2f} %", + ] + ) + + return VeristatInfo(table=table, changes=changes, new_failures=new_failures) + + +def github_markup_decorate(input_str: str) -> str: + for text, markup in GITHUB_MARKUP_REPLACEMENTS.items(): + input_str = input_str.replace(text, markup) + return input_str + + +def format_table(headers: List[str], rows: List[List[str]]) -> str: + column_width = [ + max(len(row[column_idx]) for row in [headers] + rows) + for column_idx in range(len(headers)) + ] + + # Row template string in the following format: + # "{0:8}|{1:10}|{2:15}|{3:7}|{4:10}" + row_template = "|".join( + f"{{{idx}:{width}}}" for idx, width in enumerate(column_width) + ) + row_template_nl = f"|{row_template}|\n" + + with io.StringIO() as out: + out.write(row_template_nl.format(*headers)) + + separator_row = ["-" * width for width in column_width] + out.write(row_template_nl.format(*separator_row)) + + for row in rows: + row_str = row_template_nl.format(*row) + out.write(row_str) + + return out.getvalue() + + +def main(compare_csv_filename: os.PathLike, output_filename: os.PathLike) -> None: + with open(compare_csv_filename, newline="", encoding="utf-8") as csv_file: + veristat_results = parse_table(csv_file) + + sys.stdout.write(veristat_results.get_results_summary()) + + with open(output_filename, encoding="utf-8", mode="a") as file: + file.write(veristat_results.get_results_summary(markup=True)) + + if veristat_results.new_failures: + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Print veristat comparison output as markdown step summary" + ) + parser.add_argument("filename") + args = parser.parse_args() + summary_filename = os.getenv("GITHUB_STEP_SUMMARY") + if not summary_filename: + logging.error("GITHUB_STEP_SUMMARY environment variable is not set") + sys.exit(1) + sys.exit(main(args.filename, summary_filename)) diff --git a/.github/workflows/gcc-bpf.yml b/.github/workflows/gcc-bpf.yml new file mode 100644 index 0000000000000..8356a0af836f1 --- /dev/null +++ b/.github/workflows/gcc-bpf.yml @@ -0,0 +1,100 @@ +name: Testing GCC BPF compiler + +on: + workflow_call: + inputs: + runs_on: + required: true + type: string + arch: + required: true + type: string + llvm-version: + required: true + type: string + toolchain: + required: true + type: string + toolchain_full: + required: true + type: string + download_sources: + required: true + type: boolean + +jobs: + test: + name: GCC BPF + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARCH: ${{ inputs.arch }} + BPF_NEXT_BASE_BRANCH: 'master' + GCC_BPF_INSTALL_DIR: ${{ github.workspace }}/gcc-bpf + GCC_BPF_RELEASE_REPO: 'theihor/gcc-bpf' + KBUILD_OUTPUT: ${{ github.workspace }}/src/kbuild-output + REPO_ROOT: ${{ github.workspace }}/src + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + uses: libbpf/ci/get-linux-source@v3 + with: + dest: ${{ env.REPO_ROOT }} + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + + - if: ${{ ! inputs.download_sources }} + name: Checkout ${{ github.repository }} to ./src + uses: actions/checkout@v4 + with: + path: 'src' + + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: ${{ env.REPO_ROOT }} + + - name: Untar artifacts + working-directory: ${{ env.REPO_ROOT }} + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + + - name: Download GCC BPF compiler + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: .github/scripts/download-gcc-bpf.sh ${{ env.GCC_BPF_RELEASE_REPO }} ${{ env.GCC_BPF_INSTALL_DIR }} + + - name: Build selftests/bpf/test_progs-bpf_gcc + uses: libbpf/ci/build-selftests@v3 + env: + BPF_GCC: ${{ env.GCC_BPF_INSTALL_DIR }} + MAX_MAKE_JOBS: 32 + SELFTESTS_BPF_TARGETS: 'test_progs-bpf_gcc' + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.REPO_ROOT }} + llvm-version: ${{ inputs.llvm-version }} + toolchain: ${{ inputs.toolchain }} + diff --git a/.github/workflows/kernel-build-test.yml b/.github/workflows/kernel-build-test.yml new file mode 100644 index 0000000000000..541ef910b1d28 --- /dev/null +++ b/.github/workflows/kernel-build-test.yml @@ -0,0 +1,147 @@ +name: Reusable Build/Test/Veristat workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + build_runs_on: + required: true + type: string + description: The runners to run the builds on. This is a json string representing an array of labels. + llvm-version: + required: true + type: string + description: The version of LLVM used to build selftest.... for llvm toolchain, this should match the one from toolchain_full, for gcc it is an arbritrary version we decide to build selftests against. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + tests: + required: true + type: string + description: A serialized json array with the tests to be running, it must follow the json-matrix format, https://www.jitsejan.com/use-github-actions-with-json-file-as-matrix + run_veristat: + required: true + type: boolean + description: Whether or not to run the veristat job. + run_tests: + required: true + type: boolean + description: Whether or not to run the test job. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + build_release: + required: true + type: boolean + description: Build selftests with -O2 optimization in addition to non-optimized build. + default: false + secrets: + AWS_ROLE_ARN: + required: true + +jobs: + + # Build kernel and selftest + build: + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + llvm-version: ${{ inputs.llvm-version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + + build-release: + if: ${{ inputs.build_release }} + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + llvm-version: ${{ inputs.llvm-version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + release: true + + test: + if: ${{ inputs.run_tests }} + uses: ./.github/workflows/kernel-test.yml + # Setting name to test here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "test" + needs: [build] + strategy: + fail-fast: false + matrix: ${{ fromJSON(inputs.tests) }} + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + kernel: ${{ inputs.kernel }} + test: ${{ matrix.test }} + continue_on_error: ${{ toJSON(matrix.continue_on_error) }} + timeout_minutes: ${{ matrix.timeout_minutes }} + + veristat-kernel: + if: ${{ inputs.run_veristat }} + uses: ./.github/workflows/veristat-kernel.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.runs_on }} + + veristat-meta: + # Check for vars.AWS_REGION is necessary to skip this job in case of a PR from a fork. + if: ${{ inputs.run_veristat && github.repository_owner == 'kernel-patches' && vars.AWS_REGION }} + uses: ./.github/workflows/veristat-meta.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + aws_region: ${{ vars.AWS_REGION }} + runs_on: ${{ inputs.runs_on }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + gcc-bpf: + name: 'GCC BPF' + if: ${{ inputs.arch == 'x86_64' }} + uses: ./.github/workflows/gcc-bpf.yml + needs: [build] + with: + # GCC BPF does not need /dev/kvm, so use the "build" runners + runs_on: ${{ inputs.build_runs_on }} + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + toolchain: ${{ inputs.toolchain }} + toolchain_full: ${{ inputs.toolchain_full }} + download_sources: ${{ inputs.download_sources }} + diff --git a/.github/workflows/kernel-build.yml b/.github/workflows/kernel-build.yml new file mode 100644 index 0000000000000..513d96411921e --- /dev/null +++ b/.github/workflows/kernel-build.yml @@ -0,0 +1,191 @@ + +name: Reusable build workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + llvm-version: + required: true + type: string + description: The version of LLVM used to build selftest.... for llvm toolchain, this should match the one from toolchain_full, for gcc it is an arbritrary version we decide to build selftests against. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + release: + required: false + type: boolean + description: Build selftest with -O2 optimization + default: false + +jobs: + build: + name: build for ${{ inputs.arch }} with ${{ inputs.toolchain_full }}${{ inputs.release && '-O2' || '' }} + # To run on CodeBuild, runs-on value must correspond to the AWS + # CodeBuild project associated with the kernel-patches webhook + # However matrix.py passes just a 'codebuild' string + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARTIFACTS_ARCHIVE: "vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst" + BPF_NEXT_BASE_BRANCH: 'master' + BPF_NEXT_FETCH_DEPTH: 64 # A bit of history is needed to facilitate incremental builds + CROSS_COMPILE: ${{ inputs.arch != 'x86_64' && 'true' || '' }} + # BUILD_SCHED_EXT_SELFTESTS: ${{ inputs.arch == 'x86_64' || inputs.arch == 'aarch64' && 'true' || '' }} + KBUILD_OUTPUT: ${{ github.workspace }}/kbuild-output + KERNEL: ${{ inputs.kernel }} + KERNEL_ROOT: ${{ github.workspace }} + REPO_PATH: "" + REPO_ROOT: ${{ github.workspace }} + RUNNER_TYPE: ${{ github.repository_owner == 'kernel-patches' && 'codebuild' || 'default' }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: ${{ inputs.download_sources && 1 || env.BPF_NEXT_FETCH_DEPTH }} + + - if: ${{ env.RUNNER_TYPE == 'codebuild' }} + shell: bash + run: .github/scripts/tmpfsify-workspace.sh + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + env: + FETCH_DEPTH: ${{ env.BPF_NEXT_FETCH_DEPTH }} + uses: libbpf/ci/get-linux-source@v3 + with: + dest: '.kernel' + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + - uses: libbpf/ci/prepare-incremental-build@v3 + with: + repo-root: ${{ inputs.download_sources && '.kernel' || env.REPO_ROOT }} + base-branch: >- + ${{ inputs.download_sources && env.BPF_NEXT_BASE_BRANCH + || github.event_name == 'pull_request' && github.base_ref + || github.ref_name + }} + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + - if: ${{ inputs.download_sources }} + name: Move linux source in place + shell: bash + run: | + cd .kernel + rm -rf .git + mv -t .. $(ls -A) + cd .. + rmdir .kernel + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + llvm-version: ${{ inputs.llvm-version }} + pahole: master + + # We have to setup qemu+binfmt in order to enable cross-compation of selftests. + # During selftests build, freshly built bpftool is executed. + # On self-hosted bare-metal hosts binfmt is pre-configured. + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Set up docker + uses: docker/setup-docker-action@v4 + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Setup binfmt and qemu + uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:qemu-v9.2.0 + + - name: Build kernel image + uses: libbpf/ci/build-linux@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + + - name: Build selftests/bpf + uses: libbpf/ci/build-selftests@v3 + env: + MAX_MAKE_JOBS: 32 + RELEASE: ${{ inputs.release && '1' || '' }} + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.KERNEL_ROOT }} + llvm-version: ${{ inputs.llvm-version }} + toolchain: ${{ inputs.toolchain }} + + - if: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + name: Build selftests/sched_ext + uses: libbpf/ci/build-scx-selftests@v3 + with: + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + llvm-version: ${{ inputs.llvm-version }} + max-make-jobs: 32 + + - if: ${{ github.event_name != 'push' }} + name: Build samples + uses: libbpf/ci/build-samples@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm-version }} + - name: Tar artifacts + id: tar-artifacts + uses: libbpf/ci/tar-artifacts@v3 + env: + ARCHIVE_BPF_SELFTESTS: 'true' + ARCHIVE_MAKE_HELPERS: 'true' + ARCHIVE_SCHED_EXT_SELFTESTS: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + with: + arch: ${{ inputs.arch }} + archive: ${{ env.ARTIFACTS_ARCHIVE }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + - if: ${{ github.event_name != 'push' }} + name: Remove KBUILD_OUTPUT content + shell: bash + run: | + # Remove $KBUILD_OUTPUT to prevent cache creation for pull requests. + # Only on pushed changes are build artifacts actually cached, because + # of github.com/actions/cache's cache isolation logic. + rm -rf "${KBUILD_OUTPUT}" + - uses: actions/upload-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}${{ inputs.release && '-release' || '' }} + if-no-files-found: error + path: ${{ env.ARTIFACTS_ARCHIVE }} diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml new file mode 100644 index 0000000000000..6eda80362b9b0 --- /dev/null +++ b/.github/workflows/kernel-test.yml @@ -0,0 +1,88 @@ +name: Reusable test workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + test: + required: true + type: string + description: The test to run in the vm, e.g test_progs, test_maps, test_progs_no_alu32... + continue_on_error: + required: true + type: string + description: Whether to continue on error. This is typically set to true for parallel tests which are currently known to fail, but we don't want to fail the whole CI because of that. + timeout_minutes: + required: true + type: number + description: In case a test runs for too long, after how many seconds shall we timeout and error. + +jobs: + test: + name: ${{ inputs.test }} on ${{ inputs.arch }} with ${{ inputs.toolchain_full }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + env: + ARCH: ${{ inputs.arch }} + KERNEL: ${{ inputs.kernel }} + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + CONTINUE_ON_ERROR: ${{ inputs.continue_on_error }} + DEPLOYMENT: ${{ github.repository == 'kernel-patches/bpf' && 'prod' || 'rc' }} + ALLOWLIST_FILE: /tmp/allowlist + DENYLIST_FILE: /tmp/denylist + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: . + + - name: Untar artifacts + # zstd is installed by default in the runner images. + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Run selftests + uses: libbpf/ci/run-vmtest@v3 + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + continue-on-error: ${{ fromJSON(env.CONTINUE_ON_ERROR) }} + timeout-minutes: ${{ inputs.timeout_minutes }} + env: + ARCH: ${{ inputs.arch }} + DEPLOYMENT: ${{ env.DEPLOYMENT }} + KERNEL_TEST: ${{ inputs.test }} + SELFTESTS_BPF: ${{ github.workspace }}/selftests/bpf + VMTEST_CONFIGS: ${{ github.workspace }}/ci/vmtest/configs + TEST_PROGS_WATCHDOG_TIMEOUT: 600 + with: + arch: ${{ inputs.arch }} + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: ${{ env.REPO_ROOT }} + max-cpu: 8 + kernel-test: ${{ inputs.test }} + # Here we must use kbuild-output local to the repo, because + # it was extracted from the artifacts. + kbuild-output: ${{ env.REPO_ROOT }}/kbuild-output diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000..1c910fd297309 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: "lint" + +on: + pull_request: + push: + branches: + - master + +jobs: + shellcheck: + # This workflow gets injected into other Linux repositories, but we don't + # want it to run there. + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: ShellCheck + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run ShellCheck + uses: ludeeus/action-shellcheck@master + env: + SHELLCHECK_OPTS: --severity=warning --exclude=SC1091 + + # Ensure some consistency in the formatting. + lint: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run black + uses: psf/black@stable + with: + src: ./.github/scripts + + validate_matrix: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Validate matrix.py + runs-on: ubuntu-latest + env: + GITHUB_REPOSITORY_OWNER: ${{ matrix.owner }} + GITHUB_REPOSITORY: ${{ matrix.repository }} + GITHUB_OUTPUT: /dev/stdout + strategy: + matrix: + owner: ['kernel-patches', 'foo'] + repository: ['bpf', 'vmtest', 'bar'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: run script + run: | + python3 .github/scripts/matrix.py + + unittests: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Unittests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run unittests + run: python3 -m unittest scripts/tests/*.py + working-directory: .github diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000000000..fdf9d9f58253f --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,62 @@ +name: bpf-ci + +on: + pull_request: + push: + branches: + - bpf_base + - bpf-next_base + +concurrency: + group: ci-test-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + set-matrix: + # FIXME: set-matrix is lightweight, run it on any self-hosted machines for kernel-patches org + # so we do not wait for GH hosted runners when there potentially all are busy because of bpf-rc + # repo for instance. + # This could be somehow fixed long term by making this action/workflow re-usable and letting the called + # specify what to run on. + runs-on: ${{ github.repository_owner == 'kernel-patches' && 'x86_64' || 'ubuntu-latest' }} + outputs: + build-matrix: ${{ steps.set-matrix-impl.outputs.build_matrix }} + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + - id: set-matrix-impl + run: | + python3 .github/scripts/matrix.py + + build-and-test: + # Setting name to arch-compiler here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "${{ matrix.arch }}-${{ matrix.toolchain.fullname }}" + uses: ./.github/workflows/kernel-build-test.yml + needs: [set-matrix] + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.build-matrix) }} + with: + arch: ${{ matrix.arch }} + toolchain_full: ${{ matrix.toolchain.fullname }} + toolchain: ${{ matrix.toolchain.name }} + runs_on: ${{ toJSON(matrix.runs_on) }} + build_runs_on: ${{ toJSON(matrix.build_runs_on) }} + llvm-version: ${{ matrix.toolchain.version }} + kernel: ${{ matrix.kernel }} + tests: ${{ toJSON(matrix.tests) }} + run_veristat: ${{ matrix.run_veristat }} + # We only run tests on pull requests. + run_tests: ${{ github.event_name != 'push' }} + # Download sources + download_sources: ${{ github.repository == 'kernel-patches/vmtest' }} + build_release: ${{ matrix.build_release }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/.github/workflows/veristat-kernel.yml b/.github/workflows/veristat-kernel.yml new file mode 100644 index 0000000000000..0557daebf5bab --- /dev/null +++ b/.github/workflows/veristat-kernel.yml @@ -0,0 +1,65 @@ +name: veristat_kernel + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + +jobs: + veristat: + name: ${{ inputs.arch }}-${{ inputs.toolchain }} veristat_kernel + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain }} + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat_kernel' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.kernel.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-kernel + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-kernel + diff --git a/.github/workflows/veristat-meta.yml b/.github/workflows/veristat-meta.yml new file mode 100644 index 0000000000000..00700a8205e62 --- /dev/null +++ b/.github/workflows/veristat-meta.yml @@ -0,0 +1,87 @@ +name: veristat_meta + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + aws_region: + required: true + type: string + description: The AWS region where we pull bpf objects to run against veristat. + secrets: + AWS_ROLE_ARN: + required: true + description: The AWS role used by GH to pull BPF objects from AWS. + +jobs: + veristat: + name: ${{ inputs.arch }}-${{ inputs.toolchain }} veristat_meta + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain }} + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-region: ${{ inputs.aws_region }} + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + role-session-name: github-action-bpf-ci + + - name: Download BPF objects + run: | + mkdir ./bpf_objects + aws s3 sync s3://veristat-bpf-binaries ./bpf_objects + env: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat_meta' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.meta.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-meta + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-meta + diff --git a/.mailmap b/.mailmap index a897c16d3baef..72bf830e1e588 100644 --- a/.mailmap +++ b/.mailmap @@ -88,7 +88,6 @@ Antonio Quartulli Antonio Quartulli Antonio Quartulli Antonio Quartulli -Antonio Quartulli Antonio Quartulli Anup Patel Archit Taneja @@ -522,6 +521,7 @@ Nadav Amit Nadia Yvette Chambers William Lee Irwin III Naoya Horiguchi Naoya Horiguchi +Natalie Vock Nathan Chancellor Naveen N Rao Naveen N Rao @@ -613,6 +613,8 @@ Richard Leitner Richard Leitner Robert Foss Rocky Liao +Rodrigo Siqueira +Rodrigo Siqueira Roman Gushchin Roman Gushchin Roman Gushchin @@ -689,6 +691,7 @@ Subbaraman Narayanamurthy Subhash Jadavani Sudarshan Rajagopalan Sudeep Holla Sudeep KarkadaNagesha +Sumit Garg Sumit Semwal Surabhi Vishnoi Sven Eckelmann diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index eb94526689091..b557cf1c820d2 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -176,7 +176,7 @@ Configuring the kernel values without prompting. "make defconfig" Create a ./.config file by using the default - symbol values from either arch/$ARCH/defconfig + symbol values from either arch/$ARCH/configs/defconfig or arch/$ARCH/configs/${PLATFORM}_defconfig, depending on the architecture. diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst index 2ad1c05b8c883..66af95251a3d1 100644 --- a/Documentation/admin-guide/hw-vuln/srso.rst +++ b/Documentation/admin-guide/hw-vuln/srso.rst @@ -104,7 +104,20 @@ The possible values in this file are: (spec_rstack_overflow=ibpb-vmexit) + * 'Mitigation: Reduced Speculation': + This mitigation gets automatically enabled when the above one "IBPB on + VMEXIT" has been selected and the CPU supports the BpSpecReduce bit. + + It gets automatically enabled on machines which have the + SRSO_USER_KERNEL_NO=1 CPUID bit. In that case, the code logic is to switch + to the above =ibpb-vmexit mitigation because the user/kernel boundary is + not affected anymore and thus "safe RET" is not needed. + + After enabling the IBPB on VMEXIT mitigation option, the BpSpecReduce bit + is detected (functionality present on all such machines) and that + practically overrides IBPB on VMEXIT as it has a lot less performance + impact and takes care of the guest->host attack vector too. In order to exploit vulnerability, an attacker needs to: diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 5376890adbeb0..1f7f14c6e184c 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -180,10 +180,6 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) 1) On i386, enable high memory support under "Processor type and features":: - CONFIG_HIGHMEM64G=y - - or:: - CONFIG_HIGHMEM4G 2) With CONFIG_SMP=y, usually nr_cpus=1 need specified on the kernel diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec85..3e5e41cbe3ce9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -416,10 +416,6 @@ Format: { quiet (default) | verbose | debug } Change the amount of debugging information output when initialising the APIC and IO-APIC components. - For X86-32, this can also be used to specify an APIC - driver name. - Format: apic=driver_name - Examples: apic=bigsmp apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } @@ -6582,6 +6578,8 @@ Selecting 'on' will also enable the mitigation against user space to user space task attacks. + Selecting specific mitigation does not force enable + user mitigations. Selecting 'off' will disable both the kernel and the user space protections. @@ -7672,13 +7670,6 @@ 16 - SIGBUS faults Example: user_debug=31 - userpte= - [X86,EARLY] Flags controlling user PTE allocations. - - nohigh = do not allocate PTE pages in - HIGHMEM regardless of setting - of CONFIG_HIGHPTE. - vdso= [X86,SH,SPARC] On X86_32, this is an alias for vdso32=. Otherwise: diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a43b78b4b6464..dd49a89a62d35 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -212,6 +212,17 @@ pid>/``). This value defaults to 0. +core_sort_vma +============= + +The default coredump writes VMAs in address order. By setting +``core_sort_vma`` to 1, VMAs will be written from smallest size +to largest size. This is known to break at least elfutils, but +can be handy when dealing with very large (and truncated) +coredumps where the more useful debugging details are included +in the smaller VMAs. + + core_uses_pid ============= diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index f074f6219f5c3..f968c13b46a78 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -284,6 +284,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | Rockchip | RK3588 | #3588001 | ROCKCHIP_ERRATUM_3588001 | +----------------+-----------------+-----------------+-----------------------------+ +| Rockchip | RK3568 | #3568002 | ROCKCHIP_ERRATUM_3568002 | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/arch/powerpc/cxl.rst b/Documentation/arch/powerpc/cxl.rst index d2d77057610e4..778adda740d24 100644 --- a/Documentation/arch/powerpc/cxl.rst +++ b/Documentation/arch/powerpc/cxl.rst @@ -18,6 +18,7 @@ Introduction both access system memory directly and with the same effective addresses. + **This driver is deprecated and will be removed in a future release.** Hardware overview ================= @@ -453,7 +454,7 @@ Sysfs Class A cxl sysfs class is added under /sys/class/cxl to facilitate enumeration and tuning of the accelerators. Its layout is - described in Documentation/ABI/testing/sysfs-class-cxl + described in Documentation/ABI/obsolete/sysfs-class-cxl Udev rules diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index 76f53d3450e73..77e6163288db0 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -1038,16 +1038,6 @@ Offset/size: 0x000c/4 This field contains maximal allowed type for setup_data and setup_indirect structs. -The Image Checksum -================== - -From boot protocol version 2.08 onwards the CRC-32 is calculated over -the entire file using the characteristic polynomial 0x04C11DB7 and an -initial remainder of 0xffffffff. The checksum is appended to the -file; therefore the CRC of the file up to the limit specified in the -syssize field of the header is always 0. - - The Kernel Command Line ======================= diff --git a/Documentation/arch/x86/sva.rst b/Documentation/arch/x86/sva.rst index 33cb050059820..6a759984d4712 100644 --- a/Documentation/arch/x86/sva.rst +++ b/Documentation/arch/x86/sva.rst @@ -25,7 +25,7 @@ to cache translations for virtual addresses. The IOMMU driver uses the mmu_notifier() support to keep the device TLB cache and the CPU cache in sync. When an ATS lookup fails for a virtual address, the device should use the PRI in order to request the virtual address to be paged into the -CPU page tables. The device must use ATS again in order the fetch the +CPU page tables. The device must use ATS again in order to fetch the translation before use. Shared Hardware Workqueues @@ -216,7 +216,7 @@ submitting work and processing completions. Single Root I/O Virtualization (SR-IOV) focuses on providing independent hardware interfaces for virtualizing hardware. Hence, it's required to be -almost fully functional interface to software supporting the traditional +an almost fully functional interface to software supporting the traditional BARs, space for interrupts via MSI-X, its own register layout. Virtual Functions (VFs) are assisted by the Physical Function (PF) driver. diff --git a/Documentation/arch/x86/usb-legacy-support.rst b/Documentation/arch/x86/usb-legacy-support.rst index e01c08b7c981c..b17bf122270aa 100644 --- a/Documentation/arch/x86/usb-legacy-support.rst +++ b/Documentation/arch/x86/usb-legacy-support.rst @@ -20,11 +20,7 @@ It has several drawbacks, though: features (wheel, extra buttons, touchpad mode) of the real PS/2 mouse may not be available. -2) If CONFIG_HIGHMEM64G is enabled, the PS/2 mouse emulation can cause - system crashes, because the SMM BIOS is not expecting to be in PAE mode. - The Intel E7505 is a typical machine where this happens. - -3) If AMD64 64-bit mode is enabled, again system crashes often happen, +2) If AMD64 64-bit mode is enabled, again system crashes often happen, because the SMM BIOS isn't expecting the CPU to be in 64-bit mode. The BIOS manufacturers only test with Windows, and Windows doesn't do 64-bit yet. @@ -38,11 +34,6 @@ Problem 1) compiled-in, too. Problem 2) - can currently only be solved by either disabling HIGHMEM64G - in the kernel config or USB Legacy support in the BIOS. A BIOS update - could help, but so far no such update exists. - -Problem 3) is usually fixed by a BIOS update. Check the board manufacturers web site. If an update is not available, disable USB Legacy support in the BIOS. If this alone doesn't help, try also adding diff --git a/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml b/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml index 932f981265ccb..52016a141227b 100644 --- a/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml +++ b/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml @@ -53,11 +53,17 @@ properties: reg: maxItems: 1 + power-controller: + type: object + + reboot-mode: + type: object + required: - compatible - reg -additionalProperties: true +additionalProperties: false examples: - | diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7606.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7606.yaml index ab5881d0d017f..52d3f1ce33678 100644 --- a/Documentation/devicetree/bindings/iio/adc/adi,ad7606.yaml +++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7606.yaml @@ -146,6 +146,7 @@ properties: maxItems: 2 pwm-names: + minItems: 1 items: - const: convst1 - const: convst2 diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml index f49b43f45f3d9..06e3621a8c06c 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml @@ -26,6 +26,7 @@ properties: deprecated: true - const: allwinner,sun7i-a20-sc-nmi - const: allwinner,sun9i-a80-nmi + - const: allwinner,sun55i-a523-nmi - items: - enum: - allwinner,sun8i-v3s-nmi diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml b/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml index d7ef4f1323a7a..3f99c8645767c 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml @@ -4,7 +4,7 @@ $id: http://devicetree.org/schemas/interrupt-controller/renesas,rzv2h-icu.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Renesas RZ/V2H(P) Interrupt Control Unit +title: Renesas RZ/{G3E,V2H(P)} Interrupt Control Unit maintainers: - Fabrizio Castro @@ -20,7 +20,9 @@ description: properties: compatible: - const: renesas,r9a09g057-icu # RZ/V2H(P) + enum: + - renesas,r9a09g047-icu # RZ/G3E + - renesas,r9a09g057-icu # RZ/V2H(P) '#interrupt-cells': description: The first cell is the SPI number of the NMI or the diff --git a/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml b/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml index 190a6499c932e..bef00521d5dac 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml @@ -91,6 +91,14 @@ properties: Firmware must configure interrupt delegation registers based on interrupt delegation list. + riscv,hart-indexes: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 16384 + description: + A list of hart indexes that APLIC should use to address each hart + that is mentioned in the "interrupts-extended" + dependencies: riscv,delegation: [ "riscv,children" ] diff --git a/Documentation/devicetree/bindings/interrupt-controller/sophgo,sg2042-msi.yaml b/Documentation/devicetree/bindings/interrupt-controller/sophgo,sg2042-msi.yaml new file mode 100644 index 0000000000000..e1ffd55fa7bf8 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/sophgo,sg2042-msi.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interrupt-controller/sophgo,sg2042-msi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sophgo SG2042 MSI Controller + +maintainers: + - Chen Wang + +description: + This interrupt controller is in Sophgo SG2042 for transforming interrupts from + PCIe MSI to PLIC interrupts. + +allOf: + - $ref: /schemas/interrupt-controller/msi-controller.yaml# + +properties: + compatible: + const: sophgo,sg2042-msi + + reg: + items: + - description: clear register + - description: msi doorbell address + + reg-names: + items: + - const: clr + - const: doorbell + + msi-controller: true + + msi-ranges: + maxItems: 1 + + "#msi-cells": + const: 0 + +required: + - compatible + - reg + - reg-names + - msi-controller + - msi-ranges + - "#msi-cells" + +unevaluatedProperties: false + +examples: + - | + #include + msi-controller@30000000 { + compatible = "sophgo,sg2042-msi"; + reg = <0x30000000 0x4>, <0x30000008 0x4>; + reg-names = "clr", "doorbell"; + msi-controller; + #msi-cells = <0>; + msi-ranges = <&plic 64 IRQ_TYPE_LEVEL_HIGH 32>; + }; diff --git a/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml b/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml index 0bed37a994c38..e1f4d7c35a885 100644 --- a/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml +++ b/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml @@ -33,6 +33,10 @@ properties: clocks: maxItems: 1 + clock-names: + items: + - const: nf_clk + dmas: maxItems: 1 @@ -51,6 +55,7 @@ required: - reg-names - interrupts - clocks + - clock-names unevaluatedProperties: false @@ -66,7 +71,8 @@ examples: #address-cells = <1>; #size-cells = <0>; interrupts = ; - clocks = <&nf_clk>; + clocks = <&clk>; + clock-names = "nf_clk"; cdns,board-delay-ps = <4830>; nand@0 { diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst index 77930c77fcfe6..2a206129f8284 100644 --- a/Documentation/filesystems/idmappings.rst +++ b/Documentation/filesystems/idmappings.rst @@ -63,8 +63,8 @@ what id ``k11000`` corresponds to in the second or third idmapping. The straightforward algorithm to use is to apply the inverse of the first idmapping, mapping ``k11000`` up to ``u1000``. Afterwards, we can map ``u1000`` down using either the second idmapping mapping or third idmapping mapping. The second -idmapping would map ``u1000`` down to ``21000``. The third idmapping would map -``u1000`` down to ``u31000``. +idmapping would map ``u1000`` down to ``k21000``. The third idmapping would map +``u1000`` down to ``k31000``. If we were given the same task for the following three idmappings:: diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst index 80b05a3009ea2..ab464335d3204 100644 --- a/Documentation/scheduler/sched-rt-group.rst +++ b/Documentation/scheduler/sched-rt-group.rst @@ -102,6 +102,9 @@ The system wide settings are configured under the /proc virtual file system: * sched_rt_period_us takes values from 1 to INT_MAX. * sched_rt_runtime_us takes values from -1 to sched_rt_period_us. * A run time of -1 specifies runtime == period, ie. no limit. + * sched_rt_runtime_us/sched_rt_period_us > 0.05 inorder to preserve + bandwidth for fair dl_server. For accurate value check average of + runtime/period in /sys/kernel/debug/sched/fair_server/cpuX/ 2.2 Default behaviour diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst index d639c61cb472a..ad587f53fe417 100644 --- a/Documentation/userspace-api/landlock.rst +++ b/Documentation/userspace-api/landlock.rst @@ -8,7 +8,7 @@ Landlock: unprivileged access control ===================================== :Author: Mickaël Salaün -:Date: October 2024 +:Date: January 2025 The goal of Landlock is to enable restriction of ambient rights (e.g. global filesystem or network access) for a set of processes. Because Landlock @@ -329,11 +329,11 @@ non-sandboxed process, we can specify this restriction with A sandboxed process can connect to a non-sandboxed process when its domain is not scoped. If a process's domain is scoped, it can only connect to sockets created by processes in the same scope. -Moreover, If a process is scoped to send signal to a non-scoped process, it can +Moreover, if a process is scoped to send signal to a non-scoped process, it can only send signals to processes in the same scope. A connected datagram socket behaves like a stream socket when its domain is -scoped, meaning if the domain is scoped after the socket is connected , it can +scoped, meaning if the domain is scoped after the socket is connected, it can still :manpage:`send(2)` data just like a stream socket. However, in the same scenario, a non-connected datagram socket cannot send data (with :manpage:`sendto(2)`) outside its scope. diff --git a/MAINTAINERS b/MAINTAINERS index 3864d473f52f2..bf724b30203d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1046,14 +1046,14 @@ F: drivers/crypto/ccp/hsti.* AMD DISPLAY CORE M: Harry Wentland M: Leo Li -M: Rodrigo Siqueira +R: Rodrigo Siqueira L: amd-gfx@lists.freedesktop.org S: Supported T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/display/ AMD DISPLAY CORE - DML -M: Chaitanya Dhere +M: Austin Zheng M: Jun Lei S: Supported F: drivers/gpu/drm/amd/display/dc/dml/ @@ -2210,6 +2210,7 @@ F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT M: Sven Peter +M: Janne Grunau R: Alyssa Rosenzweig L: asahi@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -2284,7 +2285,7 @@ F: drivers/irqchip/irq-aspeed-i2c-ic.c ARM/ASPEED MACHINE SUPPORT M: Joel Stanley -R: Andrew Jeffery +M: Andrew Jeffery L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-aspeed@lists.ozlabs.org (moderated for non-subscribers) S: Supported @@ -2877,7 +2878,7 @@ F: drivers/pinctrl/nxp/ ARM/NXP S32G/S32R DWMAC ETHERNET DRIVER M: Jan Petrous -L: NXP S32 Linux Team +R: s32@nxp.com S: Maintained F: Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml F: drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c @@ -5774,6 +5775,7 @@ X: drivers/clk/clkdev.c COMMON INTERNET FILE SYSTEM CLIENT (CIFS and SMB3) M: Steve French +M: Steve French R: Paulo Alcantara (DFS, global name space) R: Ronnie Sahlberg (directory leases, sparse files) R: Shyam Prasad N (multichannel) @@ -5855,7 +5857,6 @@ F: Documentation/security/snp-tdx-threat-model.rst CONFIGFS M: Joel Becker -M: Christoph Hellwig S: Supported T: git git://git.infradead.org/users/hch/configfs.git F: fs/configfs/ @@ -5926,6 +5927,17 @@ F: tools/testing/selftests/cgroup/test_cpuset.c F: tools/testing/selftests/cgroup/test_cpuset_prs.sh F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh +CONTROL GROUP - DEVICE MEMORY CONTROLLER (DMEM) +M: Maarten Lankhorst +M: Maxime Ripard +M: Natalie Vock +L: cgroups@vger.kernel.org +L: dri-devel@lists.freedesktop.org +S: Maintained +T: git https://gitlab.freedesktop.org/drm/misc/kernel.git +F: include/linux/cgroup_dmem.h +F: kernel/cgroup/dmem.c + CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko @@ -6878,7 +6890,6 @@ F: kernel/dma/map_benchmark.c F: tools/testing/selftests/dma/ DMA MAPPING HELPERS -M: Christoph Hellwig M: Marek Szyprowski R: Robin Murphy L: iommu@lists.linux.dev @@ -7425,7 +7436,6 @@ F: Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml F: drivers/gpu/drm/panel/panel-novatek-nt36672a.c DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS -M: Karol Herbst M: Lyude Paul M: Danilo Krummrich L: dri-devel@lists.freedesktop.org @@ -9433,14 +9443,11 @@ F: include/linux/fscrypt.h F: include/uapi/linux/fscrypt.h FSI SUBSYSTEM -M: Jeremy Kerr -M: Joel Stanley -R: Alistar Popple -R: Eddie James +M: Eddie James +R: Ninad Palsule L: linux-fsi@lists.ozlabs.org S: Supported Q: http://patchwork.ozlabs.org/project/linux-fsi/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/joel/fsi.git F: drivers/fsi/ F: include/linux/fsi*.h F: include/trace/events/fsi*.h @@ -9771,6 +9778,7 @@ F: include/asm-generic/vdso/vsyscall.h F: include/vdso/ F: kernel/time/vsyscall.c F: lib/vdso/ +F: tools/testing/selftests/vDSO/ GENWQE (IBM Generic Workqueue Card) M: Frank Haverkamp @@ -12646,7 +12654,9 @@ F: tools/testing/selftests/ KERNEL SMB3 SERVER (KSMBD) M: Namjae Jeon +M: Namjae Jeon M: Steve French +M: Steve French R: Sergey Senozhatsky R: Tom Talpey L: linux-cifs@vger.kernel.org @@ -12863,7 +12873,7 @@ F: include/keys/trusted_dcp.h F: security/keys/trusted-keys/trusted_dcp.c KEYS-TRUSTED-TEE -M: Sumit Garg +M: Sumit Garg L: linux-integrity@vger.kernel.org L: keyrings@vger.kernel.org S: Supported @@ -15682,7 +15692,7 @@ F: include/uapi/linux/cciss*.h MICROSOFT MANA RDMA DRIVER M: Long Li -M: Ajay Sharma +M: Konstantin Taranov L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/mana/ @@ -17663,7 +17673,7 @@ F: Documentation/ABI/testing/sysfs-bus-optee-devices F: drivers/tee/optee/ OP-TEE RANDOM NUMBER GENERATOR (RNG) DRIVER -M: Sumit Garg +M: Sumit Garg L: op-tee@lists.trustedfirmware.org S: Maintained F: drivers/char/hw_random/optee-rng.c @@ -19657,7 +19667,6 @@ F: drivers/net/wireless/quantenna RADEON and AMDGPU DRM DRIVERS M: Alex Deucher M: Christian König -M: Xinhui Pan L: amd-gfx@lists.freedesktop.org S: Supported B: https://gitlab.freedesktop.org/drm/amd/-/issues @@ -19879,7 +19888,7 @@ F: net/rds/ F: tools/testing/selftests/net/rds/ RDT - RESOURCE ALLOCATION -M: Fenghua Yu +M: Tony Luck M: Reinette Chatre L: linux-kernel@vger.kernel.org S: Supported @@ -20330,6 +20339,7 @@ RISC-V ARCHITECTURE M: Paul Walmsley M: Palmer Dabbelt M: Albert Ou +R: Alexandre Ghiti L: linux-riscv@lists.infradead.org S: Supported Q: https://patchwork.kernel.org/project/linux-riscv/list/ @@ -21923,10 +21933,13 @@ F: sound/soc/uniphier/ SOCKET TIMESTAMPING M: Willem de Bruijn +R: Jason Xing S: Maintained F: Documentation/networking/timestamping.rst F: include/linux/net_tstamp.h F: include/uapi/linux/net_tstamp.h +F: tools/testing/selftests/bpf/*/net_timestamping* +F: tools/testing/selftests/net/*timestamp* F: tools/testing/selftests/net/so_txtime.c SOEKRIS NET48XX LED SUPPORT @@ -23273,7 +23286,7 @@ F: include/media/i2c/tw9910.h TEE SUBSYSTEM M: Jens Wiklander -R: Sumit Garg +R: Sumit Garg L: op-tee@lists.trustedfirmware.org S: Maintained F: Documentation/ABI/testing/sysfs-class-tee @@ -24069,7 +24082,6 @@ F: tools/testing/selftests/ftrace/ TRACING MMIO ACCESSES (MMIOTRACE) M: Steven Rostedt M: Masami Hiramatsu -R: Karol Herbst R: Pekka Paalanen L: linux-kernel@vger.kernel.org L: nouveau@lists.freedesktop.org diff --git a/Makefile b/Makefile index 96407c1d6be16..bd89d8b8c49b8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* @@ -1014,6 +1014,9 @@ CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers endif +ifdef CONFIG_FINEIBT_BHI + CC_FLAGS_CFI += -fsanitize-kcfi-arity +endif ifdef CONFIG_RUST # Always pass -Zsanitizer-cfi-normalize-integers as CONFIG_RUST selects # CONFIG_CFI_ICALL_NORMALIZE_INTEGERS. @@ -1123,6 +1126,11 @@ endif KBUILD_USERCFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) +# userspace programs are linked via the compiler, use the correct linker +ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_LD_IS_LLD),yy) +KBUILD_USERLDFLAGS += --ld-path=$(LD) +endif + # make the checker run with the right architecture CHECKFLAGS += --arch=$(ARCH) diff --git a/README b/README index fd903645e6de0..e69de29bb2d1d 100644 --- a/README +++ b/README @@ -1,18 +0,0 @@ -Linux kernel -============ - -There are several guides for kernel developers and users. These guides can -be rendered in a number of formats, like HTML and PDF. Please read -Documentation/admin-guide/README.rst first. - -In order to build the documentation, use ``make htmldocs`` or -``make pdfdocs``. The formatted documentation can also be read online at: - - https://www.kernel.org/doc/html/latest/ - -There are various text files in the Documentation/ subdirectory, -several of them using the reStructuredText markup notation. - -Please read the Documentation/process/changes.rst file, as it contains the -requirements for building and running the kernel, and information about -the problems which may result by upgrading your kernel. diff --git a/arch/Kconfig b/arch/Kconfig index b8a4ff3655822..9f6eb09ef12d7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1584,6 +1584,10 @@ config HAVE_SPARSE_SYSCALL_NR entries at 4000, 5000 and 6000 locations. This option turns on syscall related optimizations for a given architecture. +config ARCH_HAS_VDSO_ARCH_DATA + depends on GENERIC_VDSO_DATA_STORE + bool + config ARCH_HAS_VDSO_TIME_DATA bool diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 1815748f5d2ac..bae5edf348efa 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -381,7 +381,7 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size); void iounmap(volatile void __iomem *io_addr); #define iounmap iounmap -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); #define arch_memremap_wb arch_memremap_wb /* diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h index 5b85889f82eeb..88364a6727ffc 100644 --- a/arch/arm/include/asm/vdso.h +++ b/arch/arm/include/asm/vdso.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#define __VDSO_PAGES 4 + #ifndef __ASSEMBLY__ struct mm_struct; diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h index 592d3d015ca72..1e9f81639c88c 100644 --- a/arch/arm/include/asm/vdso/gettimeofday.h +++ b/arch/arm/include/asm/vdso/gettimeofday.h @@ -112,7 +112,7 @@ static inline bool arm_vdso_hres_capable(void) #define __arch_vdso_hres_capable arm_vdso_hres_capable static __always_inline u64 __arch_get_hw_counter(int clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { #ifdef CONFIG_ARM_ARCH_TIMER u64 cycle_now; @@ -135,11 +135,6 @@ static __always_inline u64 __arch_get_hw_counter(int clock_mode, #endif } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h index 705414710dcdb..4e7226ad02ec4 100644 --- a/arch/arm/include/asm/vdso/vsyscall.h +++ b/arch/arm/include/asm/vdso/vsyscall.h @@ -7,22 +7,14 @@ #include #include -extern struct vdso_data *vdso_data; extern bool cntvct_ok; static __always_inline -struct vdso_data *__arm_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __arm_get_k_vdso_data - -static __always_inline -void __arm_sync_vdso_data(struct vdso_data *vdata) +void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { flush_dcache_page(virt_to_page(vdata)); } -#define __arch_sync_vdso_data __arm_sync_vdso_data +#define __arch_sync_vdso_time_data __arch_sync_vdso_time_data /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 4853875740d0f..123f4a8ef4466 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -153,10 +153,6 @@ int main(void) DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER); DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE); BLANK(); -#ifdef CONFIG_VDSO - DEFINE(VDSO_DATA_SIZE, sizeof(union vdso_data_store)); -#endif - BLANK(); #ifdef CONFIG_ARM_MPU DEFINE(MPU_RNG_INFO_RNGS, offsetof(struct mpu_rgn_info, rgns)); DEFINE(MPU_RNG_INFO_USED, offsetof(struct mpu_rgn_info, used)); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 29dd2f3c62fec..325448ffbba0c 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -33,15 +34,6 @@ extern char vdso_start[], vdso_end[]; /* Total number of pages needed for the data and text portions of the VDSO. */ unsigned int vdso_total_pages __ro_after_init; -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - -static struct page *vdso_data_page __ro_after_init; -static const struct vm_special_mapping vdso_data_mapping = { - .name = "[vvar]", - .pages = &vdso_data_page, -}; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -192,9 +184,6 @@ static int __init vdso_init(void) if (vdso_text_pagelist == NULL) return -ENOMEM; - /* Grab the VDSO data page. */ - vdso_data_page = virt_to_page(vdso_data); - /* Grab the VDSO text pages. */ for (i = 0; i < text_pages; i++) { struct page *page; @@ -205,7 +194,7 @@ static int __init vdso_init(void) vdso_text_mapping.pages = vdso_text_pagelist; - vdso_total_pages = 1; /* for the data/vvar page */ + vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */ vdso_total_pages += text_pages; cntvct_ok = cntvct_functional(); @@ -216,16 +205,7 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -static int install_vvar(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma; - - vma = _install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ | VM_MAYREAD, - &vdso_data_mapping); - - return PTR_ERR_OR_ZERO(vma); -} +static_assert(__VDSO_PAGES == VDSO_NR_PAGES); /* assumes mmap_lock is write-locked */ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) @@ -238,12 +218,12 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) if (vdso_text_pagelist == NULL) return; - if (install_vvar(mm, addr)) + if (IS_ERR(vdso_install_vvar_mapping(mm, addr))) return; - /* Account for vvar page. */ - addr += PAGE_SIZE; - len = (vdso_total_pages - 1) << PAGE_SHIFT; + /* Account for vvar pages. */ + addr += VDSO_NR_PAGES * PAGE_SIZE; + len = (vdso_total_pages - VDSO_NR_PAGES) << PAGE_SHIFT; vma = _install_special_mapping(mm, addr, len, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, diff --git a/arch/arm/mach-davinci/da830.c b/arch/arm/mach-davinci/da830.c index 2e497745b6246..a044ea5cb4f18 100644 --- a/arch/arm/mach-davinci/da830.c +++ b/arch/arm/mach-davinci/da830.c @@ -11,7 +11,6 @@ #include #include #include -#include #include diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c index e898f7c2733ef..94e4f4a2f73fc 100644 --- a/arch/arm/mach-imx/mmdc.c +++ b/arch/arm/mach-imx/mmdc.c @@ -509,9 +509,8 @@ static int imx_mmdc_perf_init(struct platform_device *pdev, void __iomem *mmdc_b pmu_mmdc->mmdc_ipg_clk = mmdc_ipg_clk; pmu_mmdc->devtype_data = device_get_match_data(&pdev->dev); - hrtimer_init(&pmu_mmdc->hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - pmu_mmdc->hrtimer.function = mmdc_pmu_timer_handler; + hrtimer_setup(&pmu_mmdc->hrtimer, mmdc_pmu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cpumask_set_cpu(raw_smp_processor_id(), &pmu_mmdc->cpu); diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 2b6f50dd54784..5c1023a6d78c1 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -928,6 +928,7 @@ config VDSO select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_32 select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c index 993fefdc167a8..93ef0502b7ff2 100644 --- a/arch/arm/mm/cache-l2x0-pmu.c +++ b/arch/arm/mm/cache-l2x0-pmu.c @@ -539,8 +539,7 @@ static __init int l2x0_pmu_init(void) * at higher frequencies. */ l2x0_pmu_poll_period = ms_to_ktime(1000); - hrtimer_init(&l2x0_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - l2x0_pmu_hrtimer.function = l2x0_pmu_poll; + hrtimer_setup(&l2x0_pmu_hrtimer, l2x0_pmu_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cpumask_set_cpu(0, &pmu_cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE, diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 2bec87c3327d2..39fd5df733178 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -62,7 +62,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, } static int adjust_pte(struct vm_area_struct *vma, unsigned long address, - unsigned long pfn, struct vm_fault *vmf) + unsigned long pfn, bool need_lock) { spinlock_t *ptl; pgd_t *pgd; @@ -99,12 +99,11 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (!pte) return 0; - /* - * If we are using split PTE locks, then we need to take the page - * lock here. Otherwise we are using shared mm->page_table_lock - * which is already locked, thus cannot take it. - */ - if (ptl != vmf->ptl) { + if (need_lock) { + /* + * Use nested version here to indicate that we are already + * holding one similar spinlock. + */ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { pte_unmap_unlock(pte, ptl); @@ -114,7 +113,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, ret = do_adjust_pte(vma, address, pfn, pte); - if (ptl != vmf->ptl) + if (need_lock) spin_unlock(ptl); pte_unmap(pte); @@ -123,9 +122,10 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, static void make_coherent(struct address_space *mapping, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, unsigned long pfn, - struct vm_fault *vmf) + unsigned long addr, pte_t *ptep, unsigned long pfn) { + const unsigned long pmd_start_addr = ALIGN_DOWN(addr, PMD_SIZE); + const unsigned long pmd_end_addr = pmd_start_addr + PMD_SIZE; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; unsigned long offset; @@ -141,6 +141,14 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, */ flush_dcache_mmap_lock(mapping); vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { + /* + * If we are using split PTE locks, then we need to take the pte + * lock. Otherwise we are using shared mm->page_table_lock which + * is already locked, thus cannot take it. + */ + bool need_lock = IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS); + unsigned long mpnt_addr; + /* * If this VMA is not in our MM, we can ignore it. * Note that we intentionally mask out the VMA @@ -151,7 +159,12 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, if (!(mpnt->vm_flags & VM_MAYSHARE)) continue; offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf); + mpnt_addr = mpnt->vm_start + offset; + + /* Avoid deadlocks by not grabbing the same PTE lock again. */ + if (mpnt_addr >= pmd_start_addr && mpnt_addr < pmd_end_addr) + need_lock = false; + aliases += adjust_pte(mpnt, mpnt_addr, pfn, need_lock); } flush_dcache_mmap_unlock(mapping); if (aliases) @@ -194,7 +207,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) - make_coherent(mapping, vma, addr, ptep, pfn, vmf); + make_coherent(mapping, vma, addr, ptep, pfn); else if (vma->vm_flags & VM_EXEC) __flush_icache_all(); } diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 89f1c97f3079c..748698e91a4b4 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -436,7 +436,7 @@ void __arm_iomem_set_ro(void __iomem *ptr, size_t size) set_memory_ro((unsigned long)ptr, PAGE_ALIGN(size) / PAGE_SIZE); } -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (__force void *)arch_ioremap_caller(phys_addr, size, MT_MEMORY_RW, diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 1a8f6914ee59d..d638cc87807e2 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -248,7 +248,7 @@ void __iomem *pci_remap_cfgspace(resource_size_t res_cookie, size_t size) EXPORT_SYMBOL_GPL(pci_remap_cfgspace); #endif -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (void *)phys_addr; } diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index f5f790c6e5f89..54a90b5652858 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { return uprobe_write_opcode(auprobe, mm, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), false); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 8a306bbec4a0c..cb044bfd145d1 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include hostprogs := vdsomunge diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S index 9bfa0f52923c8..7c08371f44002 100644 --- a/arch/arm/vdso/vdso.lds.S +++ b/arch/arm/vdso/vdso.lds.S @@ -11,16 +11,16 @@ */ #include -#include #include #include +#include OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE(_vdso_data = . - VDSO_DATA_SIZE); + VDSO_VVAR_SYMS . = SIZEOF_HEADERS; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 940343beb3d4c..c9fe3e7566a62 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,6 +162,7 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT @@ -1302,6 +1303,15 @@ config NVIDIA_CARMEL_CNP_ERRATUM If unsure, say Y. +config ROCKCHIP_ERRATUM_3568002 + bool "Rockchip 3568002: GIC600 can not access physical addresses higher than 4GB" + default y + help + The Rockchip RK3566 and RK3568 GIC600 SoC integrations have AXI + addressing limited to the first 32bit of physical address space. + + If unsure, say Y. + config ROCKCHIP_ERRATUM_3588001 bool "Rockchip 3588001: GIC600 can not support shareability attributes" default y diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts index e4517f47d519c..eb9470a00e549 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts @@ -226,7 +226,6 @@ }; &uart5 { - pinctrl-0 = <&uart5_xfer>; rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi index ae050cc6cd050..e80412abec081 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi @@ -396,6 +396,12 @@ status = "okay"; }; +&uart5 { + /delete-property/ dmas; + /delete-property/ dma-names; + pinctrl-0 = <&uart5_xfer>; +}; + /* Mule UCAN */ &usb_host0_ehci { status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts index 67c246ad8b8c0..ec2ce894da1fc 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts @@ -17,8 +17,7 @@ &gmac2io { phy-handle = <&yt8531c>; - tx_delay = <0x19>; - rx_delay = <0x05>; + phy-mode = "rgmii-id"; status = "okay"; mdio { diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts index 324a8e951f7e4..846b931e16d21 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts @@ -15,6 +15,7 @@ &gmac2io { phy-handle = <&rtl8211e>; + phy-mode = "rgmii"; tx_delay = <0x24>; rx_delay = <0x18>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi index 4f193704e5dc2..09508e324a280 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi @@ -109,7 +109,6 @@ assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>; assigned-clock-parents = <&gmac_clk>, <&gmac_clk>; clock_in_out = "input"; - phy-mode = "rgmii"; phy-supply = <&vcc_io>; pinctrl-0 = <&rgmiim1_pins>; pinctrl-names = "default"; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi index 988e6ca32fac9..a9ea4b0daa04c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi @@ -22,11 +22,11 @@ }; /* EC turns on w/ pp900_usb_en */ - pp900_usb: pp900-ap { + pp900_usb: regulator-pp900-ap { }; /* EC turns on w/ pp900_pcie_en */ - pp900_pcie: pp900-ap { + pp900_pcie: regulator-pp900-ap { }; pp3000: regulator-pp3000 { @@ -126,7 +126,7 @@ }; /* Always on; plain and simple */ - pp3000_ap: pp3000_emmc: pp3000 { + pp3000_ap: pp3000_emmc: regulator-pp3000 { }; pp1500_ap_io: regulator-pp1500-ap-io { @@ -160,7 +160,7 @@ }; /* EC turns on w/ pp3300_usb_en_l */ - pp3300_usb: pp3300 { + pp3300_usb: regulator-pp3300 { }; /* gpio is shared with pp1800_pcie and pinctrl is set there */ diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi index 19b23b4389658..5e068377a0a28 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi @@ -92,7 +92,7 @@ }; /* EC turns on pp1800_s3_en */ - pp1800_s3: pp1800 { + pp1800_s3: regulator-pp1800 { }; /* pp3300 children, sorted by name */ @@ -109,11 +109,11 @@ }; /* EC turns on pp3300_s0_en */ - pp3300_s0: pp3300 { + pp3300_s0: regulator-pp3300 { }; /* EC turns on pp3300_s3_en */ - pp3300_s3: pp3300 { + pp3300_s3: regulator-pp3300 { }; /* diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi index 6d9e60b01225e..7eca1da78cffa 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi @@ -189,39 +189,39 @@ }; /* EC turns on w/ pp900_ddrpll_en */ - pp900_ddrpll: pp900-ap { + pp900_ddrpll: regulator-pp900-ap { }; /* EC turns on w/ pp900_pll_en */ - pp900_pll: pp900-ap { + pp900_pll: regulator-pp900-ap { }; /* EC turns on w/ pp900_pmu_en */ - pp900_pmu: pp900-ap { + pp900_pmu: regulator-pp900-ap { }; /* EC turns on w/ pp1800_s0_en_l */ - pp1800_ap_io: pp1800_emmc: pp1800_nfc: pp1800_s0: pp1800 { + pp1800_ap_io: pp1800_emmc: pp1800_nfc: pp1800_s0: regulator-pp1800 { }; /* EC turns on w/ pp1800_avdd_en_l */ - pp1800_avdd: pp1800 { + pp1800_avdd: regulator-pp1800 { }; /* EC turns on w/ pp1800_lid_en_l */ - pp1800_lid: pp1800_mic: pp1800 { + pp1800_lid: pp1800_mic: regulator-pp1800 { }; /* EC turns on w/ lpddr_pwr_en */ - pp1800_lpddr: pp1800 { + pp1800_lpddr: regulator-pp1800 { }; /* EC turns on w/ pp1800_pmu_en_l */ - pp1800_pmu: pp1800 { + pp1800_pmu: regulator-pp1800 { }; /* EC turns on w/ pp1800_usb_en_l */ - pp1800_usb: pp1800 { + pp1800_usb: regulator-pp1800 { }; pp3000_sd_slot: regulator-pp3000-sd-slot { @@ -259,11 +259,11 @@ }; /* EC turns on w/ pp3300_trackpad_en_l */ - pp3300_trackpad: pp3300-trackpad { + pp3300_trackpad: regulator-pp3300-trackpad { }; /* EC turns on w/ usb_a_en */ - pp5000_usb_a_vbus: pp5000 { + pp5000_usb_a_vbus: regulator-pp5000 { }; ap_rtc_clk: ap-rtc-clk { diff --git a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi index e553906291140..4e730aecf84df 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi @@ -284,6 +284,18 @@ mbi-alias = <0x0 0xfd410000>; mbi-ranges = <296 24>; msi-controller; + ranges; + #address-cells = <2>; + #size-cells = <2>; + dma-noncoherent; + + its: msi-controller@fd440000 { + compatible = "arm,gic-v3-its"; + reg = <0x0 0xfd440000 0 0x20000>; + dma-noncoherent; + msi-controller; + #msi-cells = <1>; + }; }; usb_host0_ehci: usb@fd800000 { @@ -957,7 +969,7 @@ num-ib-windows = <6>; num-ob-windows = <2>; max-link-speed = <2>; - msi-map = <0x0 &gic 0x0 0x1000>; + msi-map = <0x0 &its 0x0 0x1000>; num-lanes = <1>; phys = <&combphy2 PHY_TYPE_PCIE>; phy-names = "pcie-phy"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index 8cfa30837ce72..c3abdfb04f8f4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -549,10 +549,10 @@ mmu600_pcie: iommu@fc900000 { compatible = "arm,smmu-v3"; reg = <0x0 0xfc900000 0x0 0x200000>; - interrupts = , - , - , - ; + interrupts = , + , + , + ; interrupt-names = "eventq", "gerror", "priq", "cmdq-sync"; #iommu-cells = <1>; }; @@ -560,10 +560,10 @@ mmu600_php: iommu@fcb00000 { compatible = "arm,smmu-v3"; reg = <0x0 0xfcb00000 0x0 0x200000>; - interrupts = , - , - , - ; + interrupts = , + , + , + ; interrupt-names = "eventq", "gerror", "priq", "cmdq-sync"; #iommu-cells = <1>; status = "disabled"; @@ -2668,9 +2668,9 @@ rockchip,hw-tshut-temp = <120000>; rockchip,hw-tshut-mode = <0>; /* tshut mode 0:CRU 1:GPIO */ rockchip,hw-tshut-polarity = <0>; /* tshut polarity 0:LOW 1:HIGH */ - pinctrl-0 = <&tsadc_gpio_func>; - pinctrl-1 = <&tsadc_shut>; - pinctrl-names = "gpio", "otpout"; + pinctrl-0 = <&tsadc_shut_org>; + pinctrl-1 = <&tsadc_gpio_func>; + pinctrl-names = "default", "sleep"; #thermal-sensor-cells = <1>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts index 92f0ed83c9902..bc6b43a771537 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts @@ -113,7 +113,7 @@ compatible = "regulator-fixed"; regulator-name = "vcc3v3_lcd"; enable-active-high; - gpio = <&gpio1 RK_PC4 GPIO_ACTIVE_HIGH>; + gpio = <&gpio0 RK_PC4 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&lcdpwr_en>; vin-supply = <&vcc3v3_sys>; @@ -241,7 +241,7 @@ &pinctrl { lcd { lcdpwr_en: lcdpwr-en { - rockchip,pins = <1 RK_PC4 RK_FUNC_GPIO &pcfg_pull_down>; + rockchip,pins = <0 RK_PC4 RK_FUNC_GPIO &pcfg_pull_down>; }; bl_en: bl-en { diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi index 4a950907ea6f5..840b638af1c24 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi @@ -213,7 +213,6 @@ interrupt-names = "sys", "pmc", "msg", "legacy", "err", "dma0", "dma1", "dma2", "dma3"; max-link-speed = <3>; - iommus = <&mmu600_pcie 0x0000>; num-lanes = <4>; phys = <&pcie30phy>; phy-names = "pcie-phy"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso index 672d748fcc67c..f229cb49da680 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso +++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso @@ -23,3 +23,7 @@ vpcie3v3-supply = <&vcc3v3_pcie30>; status = "okay"; }; + +&mmu600_pcie { + status = "disabled"; +}; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index cb7da44155999..1f25423de3833 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1551,6 +1551,8 @@ CONFIG_PWM_VISCONTI=m CONFIG_SL28CPLD_INTC=y CONFIG_QCOM_PDC=y CONFIG_QCOM_MPM=y +CONFIG_TI_SCI_INTR_IRQCHIP=y +CONFIG_TI_SCI_INTA_IRQCHIP=y CONFIG_RESET_GPIO=m CONFIG_RESET_IMX7=y CONFIG_RESET_QCOM_AOSS=y diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 06a4670bdb0b9..99cd6546e72e3 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -35,7 +35,7 @@ #define ARCH_DMA_MINALIGN (128) #define ARCH_KMALLOC_MINALIGN (8) -#ifndef __ASSEMBLY__ +#if !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) #include #include @@ -118,6 +118,6 @@ static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void) return ctr; } -#endif /* __ASSEMBLY__ */ +#endif /* !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) */ #endif diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 25e1626517500..555c613fd2324 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -16,6 +16,32 @@ #include #include +.macro init_el2_hcr val + mov_q x0, \val + + /* + * Compliant CPUs advertise their VHE-onlyness with + * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it + * can reset into an UNKNOWN state and might not read as 1 until it has + * been initialized explicitly. + * + * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but + * don't advertise it (they predate this relaxation). + * + * Initalize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H + * indicating whether the CPU is running in E2H mode. + */ + mrs_s x1, SYS_ID_AA64MMFR4_EL1 + sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH + cmp x1, #0 + b.ge .LnVHE_\@ + + orr x0, x0, #HCR_E2H +.LnVHE_\@: + msr hcr_el2, x0 + isb +.endm + .macro __init_el2_sctlr mov_q x0, INIT_SCTLR_EL2_MMU_OFF msr sctlr_el2, x0 @@ -244,11 +270,6 @@ .Lskip_gcs_\@: .endm -.macro __init_el2_nvhe_prepare_eret - mov x0, #INIT_PSTATE_EL1 - msr spsr_el2, x0 -.endm - .macro __init_el2_mpam /* Memory Partitioning And Monitoring: disable EL2 traps */ mrs x1, id_aa64pfr0_el1 diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6dff3e69539b..07fbf5bf85a7e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,8 +42,8 @@ extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -76,12 +76,22 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, { unsigned long stride = huge_page_size(hstate_vma(vma)); - if (stride == PMD_SIZE) - __flush_tlb_range(vma, start, end, stride, false, 2); - else if (stride == PUD_SIZE) - __flush_tlb_range(vma, start, end, stride, false, 1); - else - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); + switch (stride) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + __flush_tlb_range(vma, start, end, PUD_SIZE, false, 1); + break; +#endif + case CONT_PMD_SIZE: + case PMD_SIZE: + __flush_tlb_range(vma, start, end, PMD_SIZE, false, 2); + break; + case CONT_PTE_SIZE: + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 3); + break; + default: + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + } } #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8d94a6c0ed5c4..c2417a424b98d 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -119,7 +119,7 @@ #define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK #define TCR_EL2_T0SZ_MASK 0x3f #define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ - TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) + TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK) /* VTCR_EL2 Registers bits */ #define VTCR_EL2_DS TCR_EL2_DS diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3a7ec98ef1238..d919557af5e50 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1259,7 +1259,7 @@ int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, extern unsigned int __ro_after_init kvm_arm_vmid_bits; int __init kvm_arm_vmid_alloc_init(void); void __init kvm_arm_vmid_alloc_free(void); -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h index 3e3c3fdb18427..61679070f595c 100644 --- a/arch/arm64/include/asm/vdso.h +++ b/arch/arm64/include/asm/vdso.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_H #define __ASM_VDSO_H -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index 778c1202bbbf9..d60ea7a72a9cb 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -104,7 +104,7 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) } static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { u64 res; @@ -131,45 +131,33 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return res; } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - const struct vdso_data *ret; + const struct vdso_time_data *ret; /* - * This simply puts &_vdso_data into ret. The reason why we don't use - * `ret = _vdso_data` is that the compiler tends to optimise this in a - * very suboptimal way: instead of keeping &_vdso_data in a register, - * it goes through a relocation almost every time _vdso_data must be + * This simply puts &_vdso_time_data into ret. The reason why we don't use + * `ret = _vdso_time_data` is that the compiler tends to optimise this in a + * very suboptimal way: instead of keeping &_vdso_time_data in a register, + * it goes through a relocation almost every time _vdso_time_data must be * accessed (even in subfunctions). This is both time and space * consuming: each relocation uses a word in the code section, and it * has to be loaded at runtime. * * This trick hides the assignment from the compiler. Since it cannot * track where the pointer comes from, it will only use one relocation - * where __arch_get_vdso_data() is called, and then keep the result in - * a register. + * where __aarch64_get_vdso_u_time_data() is called, and then keep the + * result in a register. */ - asm volatile("mov %0, %1" : "=r"(ret) : "r"(_vdso_data)); + asm volatile("mov %0, %1" : "=r"(ret) : "r"(&vdso_u_time_data)); return ret; } +#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - const struct vdso_data *ret; - - /* See __arch_get_vdso_data(). */ - asm volatile("mov %0, %1" : "=r"(ret) : "r"(_timens_data)); - - return ret; -} -#endif - -static inline bool vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) { - return vd->clock_mode == VDSO_CLOCKMODE_ARCHTIMER; + return vc->clock_mode == VDSO_CLOCKMODE_ARCHTIMER; } #define vdso_clocksource_ok vdso_clocksource_ok diff --git a/arch/arm64/include/asm/vdso/getrandom.h b/arch/arm64/include/asm/vdso/getrandom.h index 342f807e20444..a2197da1951b0 100644 --- a/arch/arm64/include/asm/vdso/getrandom.h +++ b/arch/arm64/include/asm/vdso/getrandom.h @@ -33,18 +33,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - /* - * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace - * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_ - * PAGE_OFFSET points to the real VVAR page. - */ - if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * (1UL << CONFIG_PAGE_SHIFT); - return &_vdso_rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 764d13e2916c5..92a2b59a9f3df 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -67,7 +67,7 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) } static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { u64 res; @@ -99,20 +99,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return res; } -static __always_inline -const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h index eea51946d45a2..de58951b8df6a 100644 --- a/arch/arm64/include/asm/vdso/vsyscall.h +++ b/arch/arm64/include/asm/vdso/vsyscall.h @@ -2,44 +2,21 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 480 - #ifndef __ASSEMBLY__ #include -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - #define VDSO_PRECISION_MASK ~(0xFF00ULL<<48) -extern struct vdso_data *vdso_data; /* * Update the vDSO data page to keep in sync with kernel timekeeping. */ static __always_inline -struct vdso_data *__arm64_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __arm64_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__arm64_get_k_vdso_rnd_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __arm64_get_k_vdso_rnd_data - -static __always_inline -void __arm64_update_vsyscall(struct vdso_data *vdata) +void __arm64_update_vsyscall(struct vdso_time_data *vdata) { - vdata[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK; - vdata[CS_RAW].mask = VDSO_PRECISION_MASK; + vdata->clock_data[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK; + vdata->clock_data[CS_RAW].mask = VDSO_PRECISION_MASK; } #define __arch_update_vsyscall __arm64_update_vsyscall diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 5ab1970ee5436..2ce73525de2c9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -298,25 +298,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el2, x0 isb 0: - mov_q x0, HCR_HOST_NVHE_FLAGS - - /* - * Compliant CPUs advertise their VHE-onlyness with - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be - * RES1 in that case. Publish the E2H bit early so that - * it can be picked up by the init_el2_state macro. - * - * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but - * don't advertise it (they predate this relaxation). - */ - mrs_s x1, SYS_ID_AA64MMFR4_EL1 - tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f - - orr x0, x0, #HCR_E2H -1: - msr hcr_el2, x0 - isb + init_el2_hcr HCR_HOST_NVHE_FLAGS init_el2_state /* Hypervisor stub */ @@ -339,7 +322,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el1, x1 mov x2, xzr 3: - __init_el2_nvhe_prepare_eret + mov x0, #INIT_PSTATE_EL1 + msr spsr_el2, x0 mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e8ed8e5b713b5..887ac0b05961e 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -57,12 +57,6 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { #endif /* CONFIG_COMPAT_VDSO */ }; -/* - * The vDSO data page. - */ -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -104,78 +98,6 @@ static int __init __vdso_init(enum vdso_abi abi) return 0; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static const struct vm_special_mapping vvar_map; - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_map)) - zap_vma_pages(vma); - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - -static const struct vm_special_mapping vvar_map = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -185,11 +107,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -197,16 +119,14 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_PFNMAP, - &vvar_map); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; if (system_supports_bti_kernel()) gp_flags = VM_ARM64_BTI; - vdso_base += VVAR_NR_PAGES * PAGE_SIZE; + vdso_base += VDSO_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 35685c0360441..5e27e46aa4967 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -7,7 +7,7 @@ # # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 47ad6944f9f08..52314be291912 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -20,11 +20,8 @@ OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 25a2cb6317f35..f2dfdc7dc8185 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,7 +3,7 @@ # Makefile for vdso32 # -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 732702a187e9e..e02b27487ce80 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -12,16 +12,15 @@ #include #include #include +#include OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 70802e4c91cf5..5133dcbfe9f76 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1070,8 +1070,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; - hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - ctxt->hrtimer.function = kvm_hrtimer_expire; + hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); switch (timerid) { case TIMER_PTIMER: @@ -1098,8 +1097,8 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) timer_set_offset(vcpu_ptimer(vcpu), 0); } - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - timer->bg_timer.function = kvm_bg_timer_expire; + hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); } void kvm_timer_init_vm(struct kvm *kvm) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282f..0160b49243511 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -559,6 +559,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); + /* + * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, + * which happens eagerly in VHE. + * + * Also, the VMID allocator only preserves VMIDs that are active at the + * time of rollover, so KVM might need to grab a new VMID for the MMU if + * this is called from kvm_sched_in(). + */ + kvm_arm_vmid_update(&mmu->vmid); + /* * We guarantee that both TLBs and I-cache are private to each * vcpu. If detecting that a vcpu from the same VM has @@ -1138,18 +1148,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); - /* - * The VMID allocator only tracks active VMIDs per - * physical CPU, and therefore the VMID allocated may not be - * preserved on VMID roll-over if the task was preempted, - * making a thread's VMID inactive. So we need to call - * kvm_arm_vmid_update() in non-premptible context. - */ - if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) && - has_vhe()) - __load_stage2(vcpu->arch.hw_mmu, - vcpu->arch.hw_mmu->arch); - kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); @@ -1980,7 +1978,7 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - unsigned long tcr, ips; + unsigned long tcr; /* * Calculate the raw per-cpu offset without a translation from the @@ -1994,19 +1992,18 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->mair_el2 = read_sysreg(mair_el1); tcr = read_sysreg(tcr_el1); - ips = FIELD_GET(TCR_IPS_MASK, tcr); if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); tcr |= TCR_EPD1_MASK; } else { + unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); + tcr &= TCR_EL2_MASK; - tcr |= TCR_EL2_RES1; + tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) + tcr |= TCR_EL2_DS; } - tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); - tcr &= ~TCR_EL2_PS_MASK; - tcr |= FIELD_PREP(TCR_EL2_PS_MASK, ips); - if (lpa2_is_enabled()) - tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index fc18662260676..f8af11189572f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -73,8 +73,12 @@ __do_hyp_init: eret SYM_CODE_END(__kvm_hyp_init) +/* + * Initialize EL2 CPU state to sane values. + * + * HCR_EL2.E2H must have been initialized already. + */ SYM_CODE_START_LOCAL(__kvm_init_el2_state) - /* Initialize EL2 CPU state to sane values. */ init_el2_state // Clobbers x0..x2 finalise_el2_state ret @@ -206,9 +210,9 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} - bl __kvm_init_el2_state + init_el2_hcr 0 - __init_el2_nvhe_prepare_eret + bl __kvm_init_el2_state /* Enable MMU, set vectors and stack. */ mov x0, x28 diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 9c2ce1e0e99a5..c3e196fb8b18f 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -218,6 +218,9 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); + __host_enter(host_ctxt); } diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index 806223b7022af..7fe8ba1a2851c 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -135,11 +135,10 @@ void kvm_arm_vmid_clear_active(void) atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); } -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) { unsigned long flags; u64 vmid, old_active_vmid; - bool updated = false; vmid = atomic64_read(&kvm_vmid->id); @@ -157,21 +156,17 @@ bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) if (old_active_vmid != 0 && vmid_gen_match(vmid) && 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), old_active_vmid, vmid)) - return false; + return; raw_spin_lock_irqsave(&cpu_vmid_lock, flags); /* Check that our VMID belongs to the current generation. */ vmid = atomic64_read(&kvm_vmid->id); - if (!vmid_gen_match(vmid)) { + if (!vmid_gen_match(vmid)) vmid = new_vmid(kvm_vmid); - updated = true; - } atomic64_set(this_cpu_ptr(&active_vmids), vmid); raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); - - return updated; } /* diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 98a2a0e64e255..b3a7fafe8892d 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -100,20 +100,11 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, static inline int num_contig_ptes(unsigned long size, size_t *pgsize) { - int contig_ptes = 0; + int contig_ptes = 1; *pgsize = size; switch (size) { -#ifndef __PAGETABLE_PMD_FOLDED - case PUD_SIZE: - if (pud_sect_supported()) - contig_ptes = 1; - break; -#endif - case PMD_SIZE: - contig_ptes = 1; - break; case CONT_PMD_SIZE: *pgsize = PMD_SIZE; contig_ptes = CONT_PMDS; @@ -122,6 +113,8 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) *pgsize = PAGE_SIZE; contig_ptes = CONT_PTES; break; + default: + WARN_ON(!__hugetlb_valid_size(size)); } return contig_ptes; @@ -163,24 +156,23 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = __ptep_get(ptep); - unsigned long i; - - for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = __ptep_get_and_clear(mm, addr, ptep); - - /* - * If HW_AFDBM is enabled, then the HW could turn on - * the dirty or accessed bit for any page in the set, - * so check them all. - */ - if (pte_dirty(pte)) - orig_pte = pte_mkdirty(orig_pte); - - if (pte_young(pte)) - orig_pte = pte_mkyoung(orig_pte); + pte_t pte, tmp_pte; + bool present; + + pte = __ptep_get_and_clear(mm, addr, ptep); + present = pte_present(pte); + while (--ncontig) { + ptep++; + addr += pgsize; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (present) { + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } } - return orig_pte; + return pte; } static pte_t get_clear_contig_flush(struct mm_struct *mm, @@ -396,18 +388,13 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, __pte_clear(mm, addr, ptep); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) { int ncontig; size_t pgsize; - pte_t orig_pte = __ptep_get(ptep); - - if (!pte_cont(orig_pte)) - return __ptep_get_and_clear(mm, addr, ptep); - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); + ncontig = num_contig_ptes(sz, &pgsize); return get_clear_contig(mm, addr, ptep, pgsize, ncontig); } @@ -549,6 +536,8 @@ bool __init arch_hugetlb_valid_size(unsigned long size) pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long psize = huge_page_size(hstate_vma(vma)); + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings @@ -558,7 +547,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9c0b8d9558fc4..ccdef53872a0b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -279,12 +279,7 @@ void __init arm64_memblock_init(void) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; - - /* - * Use the sanitised version of id_aa64mmfr0_el1 so that linear - * map randomization can be enabled by shrinking the IPA space. - */ - u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); int parange = cpuid_feature_extract_unsigned_field( mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); s64 range = linear_region_size - diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile index 069ef0b17fe52..a3042287a0707 100644 --- a/arch/csky/kernel/vdso/Makefile +++ b/arch/csky/kernel/vdso/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Symbols present in the vdso vdso-syms += rt_sigreturn diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 2b8bd27a852fe..d7ddf2a43e63b 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -30,6 +30,7 @@ config LOONGARCH select ARCH_HAS_SET_MEMORY select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VDSO_ARCH_DATA select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION @@ -106,6 +107,7 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index c8e4057734d0d..4dc4b3e04225f 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -36,7 +36,8 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = ptep_get(ptep); @@ -51,8 +52,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/loongarch/include/asm/vdso.h b/arch/loongarch/include/asm/vdso.h index d3ba35eb23e77..f72ec79e2dde5 100644 --- a/arch/loongarch/include/asm/vdso.h +++ b/arch/loongarch/include/asm/vdso.h @@ -31,7 +31,6 @@ struct loongarch_vdso_info { unsigned long size; unsigned long offset_sigreturn; struct vm_special_mapping code_mapping; - struct vm_special_mapping data_mapping; }; extern struct loongarch_vdso_info vdso_info; diff --git a/arch/loongarch/include/asm/vdso/arch_data.h b/arch/loongarch/include/asm/vdso/arch_data.h new file mode 100644 index 0000000000000..322d0a5f1c844 --- /dev/null +++ b/arch/loongarch/include/asm/vdso/arch_data.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Author: Huacai Chen + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ + +#ifndef _VDSO_ARCH_DATA_H +#define _VDSO_ARCH_DATA_H + +#ifndef __ASSEMBLY__ + +#include +#include + +struct vdso_pcpu_data { + u32 node; +} ____cacheline_aligned_in_smp; + +struct vdso_arch_data { + struct vdso_pcpu_data pdata[NR_CPUS]; +}; + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h index e80f3c4ac7481..48c43f55b039b 100644 --- a/arch/loongarch/include/asm/vdso/getrandom.h +++ b/arch/loongarch/include/asm/vdso/getrandom.h @@ -28,11 +28,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - return &_loongarch_data.rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h index 7eb3f041af764..88cfcf1331163 100644 --- a/arch/loongarch/include/asm/vdso/gettimeofday.h +++ b/arch/loongarch/include/asm/vdso/gettimeofday.h @@ -72,7 +72,7 @@ static __always_inline int clock_getres_fallback( } static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { uint64_t count; @@ -89,18 +89,6 @@ static inline bool loongarch_vdso_hres_capable(void) } #define __arch_vdso_hres_capable loongarch_vdso_hres_capable -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h index 1c183a9b2115a..50c65fb29dafb 100644 --- a/arch/loongarch/include/asm/vdso/vdso.h +++ b/arch/loongarch/include/asm/vdso/vdso.h @@ -12,43 +12,9 @@ #include #include #include +#include -struct vdso_pcpu_data { - u32 node; -} ____cacheline_aligned_in_smp; - -struct loongarch_vdso_data { - struct vdso_pcpu_data pdata[NR_CPUS]; - struct vdso_rng_data rng_data; -}; - -/* - * The layout of vvar: - * - * high - * +---------------------+--------------------------+ - * | loongarch vdso data | LOONGARCH_VDSO_DATA_SIZE | - * +---------------------+--------------------------+ - * | time-ns vdso data | PAGE_SIZE | - * +---------------------+--------------------------+ - * | generic vdso data | PAGE_SIZE | - * +---------------------+--------------------------+ - * low - */ -#define LOONGARCH_VDSO_DATA_SIZE PAGE_ALIGN(sizeof(struct loongarch_vdso_data)) -#define LOONGARCH_VDSO_DATA_PAGES (LOONGARCH_VDSO_DATA_SIZE >> PAGE_SHIFT) - -enum vvar_pages { - VVAR_GENERIC_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_LOONGARCH_PAGES_START, - VVAR_LOONGARCH_PAGES_END = VVAR_LOONGARCH_PAGES_START + LOONGARCH_VDSO_DATA_PAGES - 1, - VVAR_NR_PAGES, -}; - -#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) - -extern struct loongarch_vdso_data _loongarch_data __attribute__((visibility("hidden"))); +#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT) #endif /* __ASSEMBLY__ */ diff --git a/arch/loongarch/include/asm/vdso/vsyscall.h b/arch/loongarch/include/asm/vdso/vsyscall.h index 8987e951d0a93..1140b54b4bc82 100644 --- a/arch/loongarch/include/asm/vdso/vsyscall.h +++ b/arch/loongarch/include/asm/vdso/vsyscall.h @@ -6,23 +6,6 @@ #include -extern struct vdso_data *vdso_data; -extern struct vdso_rng_data *vdso_rng_data; - -static __always_inline -struct vdso_data *__loongarch_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __loongarch_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__loongarch_get_k_vdso_rng_data(void) -{ - return vdso_rng_data; -} -#define __arch_get_k_vdso_rng_data __loongarch_get_k_vdso_rng_data - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index 382a09a7152c3..1120ac2824f6e 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -249,18 +249,6 @@ static __init int setup_node(int pxm) return acpi_map_pxm_to_node(pxm); } -/* - * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for - * I/O localities since SRAT does not list them. I/O localities are - * not supported at this point. - */ -unsigned int numa_distance_cnt; - -static inline unsigned int get_numa_distances_cnt(struct acpi_table_slit *slit) -{ - return slit->locality_count; -} - void __init numa_set_distance(int from, int to, int distance) { if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) { diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index 8be1c38ad8eb2..6f1524bba957e 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -315,6 +315,6 @@ static void __used output_vdso_defines(void) { COMMENT("LoongArch vDSO offsets."); - DEFINE(__VVAR_PAGES, VVAR_NR_PAGES); + DEFINE(__VDSO_PAGES, VDSO_NR_PAGES); BLANK(); } diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c index 8ae641dc53bb7..f9381800e291c 100644 --- a/arch/loongarch/kernel/machine_kexec.c +++ b/arch/loongarch/kernel/machine_kexec.c @@ -126,14 +126,14 @@ void kexec_reboot(void) /* All secondary cpus go to kexec_smp_wait */ if (smp_processor_id() > 0) { relocated_kexec_smp_wait(NULL); - unreachable(); + BUG(); } #endif do_kexec = (void *)reboot_code_buffer; do_kexec(efi_boot, cmdline_ptr, systable_ptr, start_addr, first_ind_entry); - unreachable(); + BUG(); } diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index edcfdfcad7d22..90cb3ca96f085 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -387,6 +387,9 @@ static void __init check_kernel_sections_mem(void) */ static void __init arch_mem_init(char **cmdline_p) { + /* Recalculate max_low_pfn for "mem=xxx" */ + max_pfn = max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); + if (usermem) pr_info("User-defined physical RAM map overwrite\n"); diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index fbf747447f13f..4b24589c0b565 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -423,7 +424,7 @@ void loongson_cpu_die(unsigned int cpu) mb(); } -void __noreturn arch_cpu_idle_dead(void) +static void __noreturn idle_play_dead(void) { register uint64_t addr; register void (*init_fn)(void); @@ -447,6 +448,50 @@ void __noreturn arch_cpu_idle_dead(void) BUG(); } +#ifdef CONFIG_HIBERNATION +static void __noreturn poll_play_dead(void) +{ + register uint64_t addr; + register void (*init_fn)(void); + + idle_task_exit(); + __this_cpu_write(cpu_state, CPU_DEAD); + + __smp_mb(); + do { + __asm__ __volatile__("nop\n\t"); + addr = iocsr_read64(LOONGARCH_IOCSR_MBUF0); + } while (addr == 0); + + init_fn = (void *)TO_CACHE(addr); + iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR); + + init_fn(); + BUG(); +} +#endif + +static void (*play_dead)(void) = idle_play_dead; + +void __noreturn arch_cpu_idle_dead(void) +{ + play_dead(); + BUG(); /* play_dead() doesn't return */ +} + +#ifdef CONFIG_HIBERNATION +int hibernate_resume_nonboot_cpu_disable(void) +{ + int ret; + + play_dead = poll_play_dead; + ret = suspend_disable_secondary_cpus(); + play_dead = idle_play_dead; + + return ret; +} +#endif + #endif /* diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 05e5fbac102a9..10cf1608c7b32 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -25,18 +25,6 @@ extern char vdso_start[], vdso_end[]; -/* Kernel-provided data used by the VDSO. */ -static union vdso_data_store generic_vdso_data __page_aligned_data; - -static union { - u8 page[LOONGARCH_VDSO_DATA_SIZE]; - struct loongarch_vdso_data vdata; -} loongarch_vdso_data __page_aligned_data; - -struct vdso_data *vdso_data = generic_vdso_data.data; -struct vdso_pcpu_data *vdso_pdata = loongarch_vdso_data.vdata.pdata; -struct vdso_rng_data *vdso_rng_data = &loongarch_vdso_data.vdata.rng_data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.vdso = (void *)(new_vma->vm_start); @@ -44,53 +32,12 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc return 0; } -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - unsigned long pfn; - struct page *timens_page = find_timens_vvar_page(vma); - - switch (vmf->pgoff) { - case VVAR_GENERIC_PAGE_OFFSET: - if (!timens_page) - pfn = sym_to_pfn(vdso_data); - else - pfn = page_to_pfn(timens_page); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace specific - * VVAR is mapped with the VVAR_GENERIC_PAGE_OFFSET and the real - * VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - else - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - case VVAR_LOONGARCH_PAGES_START ... VVAR_LOONGARCH_PAGES_END: - pfn = sym_to_pfn(&loongarch_vdso_data) + vmf->pgoff - VVAR_LOONGARCH_PAGES_START; - break; - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - struct loongarch_vdso_info vdso_info = { .vdso = vdso_start, .code_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }, - .data_mapping = { - .name = "[vvar]", - .fault = vvar_fault, - }, .offset_sigreturn = vdso_offset_sigreturn, }; @@ -101,7 +48,7 @@ static int __init init_vdso(void) BUG_ON(!PAGE_ALIGNED(vdso_info.vdso)); for_each_possible_cpu(cpu) - vdso_pdata[cpu].node = cpu_to_node(cpu); + vdso_k_arch_data->pdata[cpu].node = cpu_to_node(cpu); vdso_info.size = PAGE_ALIGN(vdso_end - vdso_start); vdso_info.code_mapping.pages = @@ -115,37 +62,6 @@ static int __init init_vdso(void) } subsys_initcall(init_vdso); -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a - * task changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vdso_info.data_mapping)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - static unsigned long vdso_base(void) { unsigned long base = STACK_TOP; @@ -181,9 +97,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; } - vma = _install_special_mapping(mm, data_addr, VVAR_SIZE, - VM_READ | VM_MAYREAD | VM_PFNMAP, - &info->data_mapping); + vma = vdso_install_vvar_mapping(mm, data_addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index c1e8ec5b941b2..ea321403644ad 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -669,6 +669,12 @@ static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write) struct kvm_run *run = vcpu->run; unsigned long badv = vcpu->arch.badv; + /* Inject ADE exception if exceed max GPA size */ + if (unlikely(badv >= vcpu->kvm->arch.gpa_size)) { + kvm_queue_exception(vcpu, EXCCODE_ADE, EXSUBCODE_ADEM); + return RESUME_GUEST; + } + ret = kvm_handle_mm_fault(vcpu, badv, write); if (ret) { /* Treat as MMIO */ diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index f6d3242b9234a..b6864d6e5ec8d 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -317,6 +317,13 @@ int kvm_arch_enable_virtualization_cpu(void) kvm_debug("GCFG:%lx GSTAT:%lx GINTC:%lx GTLBC:%lx", read_csr_gcfg(), read_csr_gstat(), read_csr_gintc(), read_csr_gtlbc()); + /* + * HW Guest CSR registers are lost after CPU suspend and resume. + * Clear last_vcpu so that Guest CSR registers forced to reload + * from vCPU SW state. + */ + this_cpu_ptr(vmcs)->last_vcpu = NULL; + return 0; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 20f941af3e9ea..dbf5d7ae39718 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -311,7 +311,7 @@ static int kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) { int ret = RESUME_GUEST; unsigned long estat = vcpu->arch.host_estat; - u32 intr = estat & 0x1fff; /* Ignore NMI */ + u32 intr = estat & CSR_ESTAT_IS; u32 ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; vcpu->mode = OUTSIDE_GUEST_MODE; @@ -1459,8 +1459,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.vpid = 0; vcpu->arch.flush_gpa = INVALID_GPA; - hrtimer_init(&vcpu->arch.swtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - vcpu->arch.swtimer.function = kvm_swtimer_wakeup; + hrtimer_setup(&vcpu->arch.swtimer, kvm_swtimer_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); vcpu->arch.handle_exit = kvm_handle_exit; vcpu->arch.guest_eentry = (unsigned long)kvm_loongarch_ops->exc_entry; diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index b8b3e1972d6ea..edccfc8c9cd80 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -48,7 +48,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (kvm_pvtime_supported()) kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); - kvm->arch.gpa_size = BIT(cpu_vabits - 1); + /* + * cpu_vabits means user address space only (a half of total). + * GPA size of VM is the same with the size of user address space. + */ + kvm->arch.gpa_size = BIT(cpu_vabits); kvm->arch.root_level = CONFIG_PGTABLE_LEVELS - 1; kvm->arch.invalid_ptes[0] = 0; kvm->arch.invalid_ptes[1] = (unsigned long)invalid_pte_table; diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c index 914e82ff3f656..1df9e99582cc6 100644 --- a/arch/loongarch/mm/mmap.c +++ b/arch/loongarch/mm/mmap.c @@ -3,6 +3,7 @@ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ #include +#include #include #include #include @@ -63,8 +64,11 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp, } info.length = len; - info.align_mask = do_color_align ? (PAGE_MASK & SHM_ALIGN_MASK) : 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp && is_file_hugepages(filp)) + info.align_mask = huge_page_mask_align(filp); + else + info.align_mask = do_color_align ? (PAGE_MASK & SHM_ALIGN_MASK) : 0; if (dir == DOWN) { info.flags = VM_UNMAPPED_AREA_TOPDOWN; diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index fdde1bcd4e266..1c26147aff701 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -2,7 +2,7 @@ # Objects to go into the VDSO. # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o vgetrandom.o \ vgetrandom-chacha.o sigreturn.o diff --git a/arch/loongarch/vdso/vdso.lds.S b/arch/loongarch/vdso/vdso.lds.S index 160cfaef2de45..8ff9864999475 100644 --- a/arch/loongarch/vdso/vdso.lds.S +++ b/arch/loongarch/vdso/vdso.lds.S @@ -5,6 +5,7 @@ */ #include #include +#include OUTPUT_FORMAT("elf64-loongarch", "elf64-loongarch", "elf64-loongarch") @@ -12,11 +13,8 @@ OUTPUT_ARCH(loongarch) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - PROVIDE(_loongarch_data = _vdso_data + 2 * PAGE_SIZE); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c index 0db51258b2a7c..5301cd9d0f839 100644 --- a/arch/loongarch/vdso/vgetcpu.c +++ b/arch/loongarch/vdso/vgetcpu.c @@ -19,27 +19,19 @@ static __always_inline int read_cpu_id(void) return cpu_id; } -static __always_inline const struct vdso_pcpu_data *get_pcpu_data(void) -{ - return _loongarch_data.pdata; -} - extern int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused); int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused) { int cpu_id; - const struct vdso_pcpu_data *data; cpu_id = read_cpu_id(); if (cpu) *cpu = cpu_id; - if (node) { - data = get_pcpu_data(); - *node = data[cpu_id].node; - } + if (node) + *node = vdso_u_arch_data.pdata[cpu_id].node; return 0; } diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index f1ae4ed890db5..80afc3a187249 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -44,8 +44,10 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm) pgd_t *new_pgd; new_pgd = __pgd_alloc(mm, 0); - memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE); - memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT)); + if (likely(new_pgd != NULL)) { + memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE); + memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT)); + } return new_pgd; } diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 1924f2d839323..b51168e319ea3 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,6 +51,7 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H diff --git a/arch/mips/boot/tools/relocs.c b/arch/mips/boot/tools/relocs.c index a88d66c46d7f7..9863e1d5c62e3 100644 --- a/arch/mips/boot/tools/relocs.c +++ b/arch/mips/boot/tools/relocs.c @@ -468,6 +468,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)) { int i; + struct section *extab_sec = sec_lookup("__ex_table"); + int extab_index = extab_sec ? extab_sec - secs : -1; /* Walk through the relocations */ for (i = 0; i < ehdr.e_shnum; i++) { @@ -480,6 +482,9 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, if (sec->shdr.sh_type != SHT_REL_TYPE) continue; + if (sec->shdr.sh_info == extab_index) + continue; + sec_symtab = sec->link; sec_applies = &secs[sec->shdr.sh_info]; if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index d0a86ce83de91..fbc71ddcf0f68 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -27,7 +27,8 @@ static inline int prepare_hugepage_range(struct file *file, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = *ptep; @@ -42,13 +43,14 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); /* * clear the huge pte entry firstly, so that the other smp threads will * not get old pte entry after finishing flush_tlb_page and before * setting new huge pte entry */ - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index 44a45f3fa4b01..fd32baa30e172 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -167,7 +167,7 @@ static __always_inline u64 read_r4k_count(void) #ifdef CONFIG_CLKSRC_MIPS_GIC -static __always_inline u64 read_gic_count(const struct vdso_data *data) +static __always_inline u64 read_gic_count(const struct vdso_time_data *data) { void __iomem *gic = get_gic(data); u32 hi, hi2, lo; @@ -184,7 +184,7 @@ static __always_inline u64 read_gic_count(const struct vdso_data *data) #endif static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { #ifdef CONFIG_CSRC_R4K if (clock_mode == VDSO_CLOCKMODE_R4K) @@ -209,10 +209,11 @@ static inline bool mips_vdso_hres_capable(void) } #define __arch_vdso_hres_capable mips_vdso_hres_capable -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - return get_vdso_data(); + return get_vdso_time_data(); } +#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data #endif /* !__ASSEMBLY__ */ diff --git a/arch/mips/include/asm/vdso/vdso.h b/arch/mips/include/asm/vdso/vdso.h index 6cd88191fefa9..acd0efcd3d93e 100644 --- a/arch/mips/include/asm/vdso/vdso.h +++ b/arch/mips/include/asm/vdso/vdso.h @@ -5,16 +5,18 @@ */ #include +#include + +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ #include -#include #include -static inline unsigned long get_vdso_base(void) +static inline const struct vdso_time_data *get_vdso_time_data(void) { - unsigned long addr; + const struct vdso_time_data *addr; /* * We can't use cpu_has_mips_r6 since it needs the cpu_data[] @@ -27,7 +29,7 @@ static inline unsigned long get_vdso_base(void) * We can't use addiupc because there is no label-label * support for the addiupc reloc */ - __asm__("lapc %0, _start \n" + __asm__("lapc %0, vdso_u_time_data \n" : "=r" (addr) : :); #else /* @@ -46,7 +48,7 @@ static inline unsigned long get_vdso_base(void) " .set noreorder \n" " bal 1f \n" " nop \n" - " .word _start - . \n" + " .word vdso_u_time_data - . \n" "1: lw %0, 0($31) \n" " " STR(PTR_ADDU) " %0, $31, %0 \n" " .set pop \n" @@ -58,14 +60,9 @@ static inline unsigned long get_vdso_base(void) return addr; } -static inline const struct vdso_data *get_vdso_data(void) -{ - return (const struct vdso_data *)(get_vdso_base() - PAGE_SIZE); -} - #ifdef CONFIG_CLKSRC_MIPS_GIC -static inline void __iomem *get_gic(const struct vdso_data *data) +static inline void __iomem *get_gic(const struct vdso_time_data *data) { return (void __iomem *)((unsigned long)data & PAGE_MASK) - PAGE_SIZE; } diff --git a/arch/mips/include/asm/vdso/vsyscall.h b/arch/mips/include/asm/vdso/vsyscall.h index a4582870aaea4..2b1debb62dee0 100644 --- a/arch/mips/include/asm/vdso/vsyscall.h +++ b/arch/mips/include/asm/vdso/vsyscall.h @@ -2,22 +2,12 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H +#include + #ifndef __ASSEMBLY__ #include -extern struct vdso_data *vdso_data; - -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. - */ -static __always_inline -struct vdso_data *__mips_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __mips_get_k_vdso_data - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 75c9d3618f588..de096777172f0 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -23,20 +24,7 @@ #include #include -/* Kernel-provided data used by the VDSO. */ -static union vdso_data_store mips_vdso_data __page_aligned_data; -struct vdso_data *vdso_data = mips_vdso_data.data; - -/* - * Mapping for the VDSO data/GIC pages. The real pages are mapped manually, as - * what we map and where within the area they are mapped is determined at - * runtime. - */ -static struct page *no_pages[] = { NULL }; -static struct vm_special_mapping vdso_vvar_mapping = { - .name = "[vvar]", - .pages = no_pages, -}; +static_assert(VDSO_NR_PAGES == __VDSO_PAGES); static void __init init_vdso_image(struct mips_vdso_image *image) { @@ -90,7 +78,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mips_vdso_image *image = current->thread.abi->vdso; struct mm_struct *mm = current->mm; - unsigned long gic_size, vvar_size, size, base, data_addr, vdso_addr, gic_pfn, gic_base; + unsigned long gic_size, size, base, data_addr, vdso_addr, gic_pfn, gic_base; struct vm_area_struct *vma; int ret; @@ -119,8 +107,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * the counter registers at the start. */ gic_size = mips_gic_present() ? PAGE_SIZE : 0; - vvar_size = gic_size + PAGE_SIZE; - size = vvar_size + image->size; + size = gic_size + VDSO_NR_PAGES * PAGE_SIZE + image->size; /* * Find a region that's large enough for us to perform the @@ -143,15 +130,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) */ if (cpu_has_dc_aliases) { base = __ALIGN_MASK(base, shm_align_mask); - base += ((unsigned long)vdso_data - gic_size) & shm_align_mask; + base += ((unsigned long)vdso_k_time_data - gic_size) & shm_align_mask; } data_addr = base + gic_size; - vdso_addr = data_addr + PAGE_SIZE; + vdso_addr = data_addr + VDSO_NR_PAGES * PAGE_SIZE; - vma = _install_special_mapping(mm, base, vvar_size, - VM_READ | VM_MAYREAD, - &vdso_vvar_mapping); + vma = vdso_install_vvar_mapping(mm, data_addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -161,6 +146,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (gic_size) { gic_base = (unsigned long)mips_gic_base + MIPS_GIC_USER_OFS; gic_pfn = PFN_DOWN(__pa(gic_base)); + static const struct vm_special_mapping gic_mapping = { + .name = "[gic]", + .pages = (struct page **) { NULL }, + }; + + vma = _install_special_mapping(mm, base, gic_size, VM_READ | VM_MAYREAD, + &gic_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } ret = io_remap_pfn_range(vma, base, gic_pfn, gic_size, pgprot_noncached(vma->vm_page_prot)); @@ -168,13 +164,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; } - /* Map data page. */ - ret = remap_pfn_range(vma, data_addr, - virt_to_phys(vdso_data) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot); - if (ret) - goto out; - /* Map VDSO image. */ vma = _install_special_mapping(mm, vdso_addr, image->size, VM_READ | VM_EXEC | diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 60b43ea85c125..cef3c423a41ae 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -288,9 +288,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) return err; - hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; + hrtimer_setup(&vcpu->arch.comparecount_timer, kvm_mips_comparecount_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Allocate space for host mode exception handlers that handle diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index b289b2c1b2946..fb4c493aaffa9 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -2,7 +2,7 @@ # Objects to go into the VDSO. # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso-y := elf.o vgettimeofday.o sigreturn.o diff --git a/arch/mips/vdso/vdso.lds.S b/arch/mips/vdso/vdso.lds.S index 836465e3bcb8a..c8bbe56d89cb0 100644 --- a/arch/mips/vdso/vdso.lds.S +++ b/arch/mips/vdso/vdso.lds.S @@ -5,6 +5,8 @@ */ #include +#include +#include #if _MIPS_SIM == _MIPS_SIM_ABI64 OUTPUT_FORMAT("elf64-tradlittlemips", "elf64-tradbigmips", "elf64-tradlittlemips") @@ -18,7 +20,8 @@ OUTPUT_ARCH(mips) SECTIONS { - PROVIDE(_start = .); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; /* diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 5b3a5429f71b3..21e9ace177395 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -10,7 +10,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/parisc/include/asm/vdso.h b/arch/parisc/include/asm/vdso.h index 2a2dc11b5545f..5f581c1d64606 100644 --- a/arch/parisc/include/asm/vdso.h +++ b/arch/parisc/include/asm/vdso.h @@ -12,8 +12,6 @@ #define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) #define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) -extern struct vdso_data *vdso_data; - #endif /* __ASSEMBLY __ */ /* Default link addresses for the vDSOs */ diff --git a/arch/parisc/kernel/vdso32/Makefile b/arch/parisc/kernel/vdso32/Makefile index 288f8b85978fb..4ee8d17da229b 100644 --- a/arch/parisc/kernel/vdso32/Makefile +++ b/arch/parisc/kernel/vdso32/Makefile @@ -1,5 +1,5 @@ # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include KCOV_INSTRUMENT := n diff --git a/arch/parisc/kernel/vdso64/Makefile b/arch/parisc/kernel/vdso64/Makefile index bc5d9553f3112..c63f4069170f4 100644 --- a/arch/parisc/kernel/vdso64/Makefile +++ b/arch/parisc/kernel/vdso64/Makefile @@ -1,5 +1,5 @@ # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include KCOV_INSTRUMENT := n diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index e9d18cf25b792..a94fe546d434f 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -126,7 +126,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t entry; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 424f188e62d98..8a2a6e403eb1e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -159,6 +159,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN + select ARCH_HAS_VDSO_ARCH_DATA select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_EXTRA_ELF_NOTES if SPU_BASE select ARCH_KEEP_MEMBLOCK @@ -209,6 +210,7 @@ config PPC select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dad2e7980f245..86326587e58de 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -45,7 +45,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); } @@ -55,8 +56,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_hugetlb_page(vma, addr); return pte; } diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 8d972bc98b55f..1ca23fbfe087a 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_VDSO_H #define VDSO_VERSION_STRING LINUX_2.6.15 +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/vdso/arch_data.h b/arch/powerpc/include/asm/vdso/arch_data.h new file mode 100644 index 0000000000000..c240a6b875181 --- /dev/null +++ b/arch/powerpc/include/asm/vdso/arch_data.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2002 Peter Bergner , IBM + * Copyright (C) 2005 Benjamin Herrenschmidy , + * IBM Corp. + */ +#ifndef _ASM_POWERPC_VDSO_ARCH_DATA_H +#define _ASM_POWERPC_VDSO_ARCH_DATA_H + +#include +#include + +#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32) + +#ifdef CONFIG_PPC64 + +struct vdso_arch_data { + __u64 tb_ticks_per_sec; /* Timebase tics / sec */ + __u32 dcache_block_size; /* L1 d-cache block size */ + __u32 icache_block_size; /* L1 i-cache block size */ + __u32 dcache_log_block_size; /* L1 d-cache log block size */ + __u32 icache_log_block_size; /* L1 i-cache log block size */ + __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ + __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */ +}; + +#else /* CONFIG_PPC64 */ + +struct vdso_arch_data { + __u64 tb_ticks_per_sec; /* Timebase tics / sec */ + __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ + __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */ +}; + +#endif /* CONFIG_PPC64 */ + +#endif /* _ASM_POWERPC_VDSO_ARCH_DATA_H */ diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h index 80ce0709725eb..067a5396aac6e 100644 --- a/arch/powerpc/include/asm/vdso/getrandom.h +++ b/arch/powerpc/include/asm/vdso/getrandom.h @@ -43,20 +43,21 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig (unsigned long)len, (unsigned long)flags); } -static __always_inline struct vdso_rng_data *__arch_get_vdso_rng_data(void) +static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void) { - struct vdso_arch_data *data; + struct vdso_rng_data *data; asm ( " bcl 20, 31, .+4 ;" "0: mflr %0 ;" - " addis %0, %0, (_vdso_datapage - 0b)@ha ;" - " addi %0, %0, (_vdso_datapage - 0b)@l ;" + " addis %0, %0, (vdso_u_rng_data - 0b)@ha ;" + " addi %0, %0, (vdso_u_rng_data - 0b)@l ;" : "=r" (data) : : "lr" ); - return &data->rng_data; + return data; } +#define __arch_get_vdso_u_rng_data __arch_get_vdso_u_rng_data ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index c6390890a60c2..99c9d6f43fde2 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -94,22 +94,12 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) #endif static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return get_tb(); } -const struct vdso_data *__arch_get_vdso_data(void); - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return (void *)vd + (1U << CONFIG_PAGE_SHIFT); -} -#endif - -static inline bool vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) { return true; } @@ -135,21 +125,22 @@ static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift) #ifdef __powerpc64__ int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd); + const struct vdso_time_data *vd); int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res, - const struct vdso_data *vd); + const struct vdso_time_data *vd); #else int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, - const struct vdso_data *vd); + const struct vdso_time_data *vd); int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd); + const struct vdso_time_data *vd); int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, - const struct vdso_data *vd); + const struct vdso_time_data *vd); #endif int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, - const struct vdso_data *vd); + const struct vdso_time_data *vd); __kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, - const struct vdso_data *vd); + const struct vdso_time_data *vd); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h index 48560a1195595..c2c9ae1b22e71 100644 --- a/arch/powerpc/include/asm/vdso/vsyscall.h +++ b/arch/powerpc/include/asm/vdso/vsyscall.h @@ -6,19 +6,6 @@ #include -static __always_inline -struct vdso_data *__arch_get_k_vdso_data(void) -{ - return vdso_data->data; -} -#define __arch_get_k_vdso_data __arch_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__arch_get_k_vdso_rng_data(void) -{ - return &vdso_data->rng_data; -} - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index a202f5b634795..95d45a50355d2 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -11,56 +11,18 @@ #ifndef __ASSEMBLY__ -#include -#include #include -#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32) - -#ifdef CONFIG_PPC64 - -struct vdso_arch_data { - __u64 tb_ticks_per_sec; /* Timebase tics / sec */ - __u32 dcache_block_size; /* L1 d-cache block size */ - __u32 icache_block_size; /* L1 i-cache block size */ - __u32 dcache_log_block_size; /* L1 d-cache log block size */ - __u32 icache_log_block_size; /* L1 i-cache log block size */ - __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ - __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */ - - struct vdso_rng_data rng_data; - - struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT); -}; - -#else /* CONFIG_PPC64 */ - -struct vdso_arch_data { - __u64 tb_ticks_per_sec; /* Timebase tics / sec */ - __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ - __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */ - struct vdso_rng_data rng_data; - - struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT); -}; - -#endif /* CONFIG_PPC64 */ - -extern struct vdso_arch_data *vdso_data; - #else /* __ASSEMBLY__ */ -.macro get_datapage ptr offset=0 +.macro get_datapage ptr symbol bcl 20, 31, .+4 999: mflr \ptr - addis \ptr, \ptr, (_vdso_datapage - 999b + \offset)@ha - addi \ptr, \ptr, (_vdso_datapage - 999b + \offset)@l + addis \ptr, \ptr, (\symbol - 999b)@ha + addi \ptr, \ptr, (\symbol - 999b)@l .endm -#include -#include - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 7a390bd4f4af3..b3048f6d3822c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -334,7 +334,6 @@ int main(void) #endif /* ! CONFIG_PPC64 */ /* datapage offsets for use by vdso */ - OFFSET(VDSO_DATA_OFFSET, vdso_arch_data, data); OFFSET(CFG_TB_TICKS_PER_SEC, vdso_arch_data, tb_ticks_per_sec); #ifdef CONFIG_PPC64 OFFSET(CFG_ICACHE_BLOCKSZ, vdso_arch_data, icache_block_size); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0727332ad86fb..15784c5c95c77 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -950,7 +950,7 @@ void __init time_init(void) sys_tz.tz_dsttime = 0; } - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; + vdso_k_arch_data->tb_ticks_per_sec = tb_ticks_per_sec; #ifdef CONFIG_PPC64_PROC_SYSTEMCFG systemcfg->tb_ticks_per_sec = tb_ticks_per_sec; #endif diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 43379365ce1b3..219d67bcf747e 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -32,6 +32,8 @@ #include #include +static_assert(__VDSO_PAGES == VDSO_NR_PAGES); + /* The alignment of the vDSO */ #define VDSO_ALIGNMENT (1 << 16) @@ -40,24 +42,6 @@ extern char vdso64_start, vdso64_end; long sys_ni_syscall(void); -/* - * The vdso data page (aka. systemcfg for old ppc64 fans) is here. - * Once the early boot kernel code no longer needs to muck around - * with it, it will become dynamically allocated - */ -static union { - struct vdso_arch_data data; - u8 page[2 * PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_arch_data *vdso_data = &vdso_data_store.data; - -enum vvar_pages { - VVAR_BASE_PAGE_OFFSET, - VVAR_TIME_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, unsigned long text_size) { @@ -96,14 +80,6 @@ static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struc mm->context.vdso = NULL; } -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf); - -static struct vm_special_mapping vvar_spec __ro_after_init = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, @@ -116,73 +92,6 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { .close = vdso_close, }; -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return vvar_page; -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - VMA_ITERATOR(vmi, mm, 0); - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_spec)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_BASE_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - break; - case VVAR_TIME_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = virt_to_pfn(vdso_data->data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data->data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree @@ -191,7 +100,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int { unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec; - unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE; + unsigned long vvar_size = VDSO_NR_PAGES * PAGE_SIZE; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -217,9 +126,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int /* Add required alignment. */ vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); - vma = _install_special_mapping(mm, vdso_base, vvar_size, - VM_READ | VM_MAYREAD | VM_IO | - VM_DONTDUMP | VM_PFNMAP, &vvar_spec); + vma = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -299,10 +206,10 @@ static void __init vdso_setup_syscall_map(void) for (i = 0; i < NR_syscalls; i++) { if (sys_call_table[i] != (void *)&sys_ni_syscall) - vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); + vdso_k_arch_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); if (IS_ENABLED(CONFIG_COMPAT) && compat_sys_call_table[i] != (void *)&sys_ni_syscall) - vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); + vdso_k_arch_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); } } @@ -352,10 +259,10 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) static int __init vdso_init(void) { #ifdef CONFIG_PPC64 - vdso_data->dcache_block_size = ppc64_caches.l1d.block_size; - vdso_data->icache_block_size = ppc64_caches.l1i.block_size; - vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; - vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; + vdso_k_arch_data->dcache_block_size = ppc64_caches.l1d.block_size; + vdso_k_arch_data->icache_block_size = ppc64_caches.l1i.block_size; + vdso_k_arch_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; + vdso_k_arch_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; #endif /* CONFIG_PPC64 */ vdso_setup_syscall_map(); diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 0e3ed6fb199ff..e8824f9333261 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -3,7 +3,7 @@ # List of files in the vdso, has to be asm only for now # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o obj-vdso64 = sigtramp64-64.o gettimeofday-64.o datapage-64.o cacheflush-64.o note-64.o getcpu-64.o diff --git a/arch/powerpc/kernel/vdso/cacheflush.S b/arch/powerpc/kernel/vdso/cacheflush.S index 0085ae464dac9..488d3ade11e64 100644 --- a/arch/powerpc/kernel/vdso/cacheflush.S +++ b/arch/powerpc/kernel/vdso/cacheflush.S @@ -30,7 +30,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) #ifdef CONFIG_PPC64 mflr r12 .cfi_register lr,r12 - get_datapage r10 + get_datapage r10 vdso_u_arch_data mtlr r12 .cfi_restore lr #endif diff --git a/arch/powerpc/kernel/vdso/datapage.S b/arch/powerpc/kernel/vdso/datapage.S index db8e167f01667..d23b2e8e2a34c 100644 --- a/arch/powerpc/kernel/vdso/datapage.S +++ b/arch/powerpc/kernel/vdso/datapage.S @@ -28,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) mflr r12 .cfi_register lr,r12 mr. r4,r3 - get_datapage r3 + get_datapage r3 vdso_u_arch_data mtlr r12 #ifdef __powerpc64__ addi r3,r3,CFG_SYSCALL_MAP64 @@ -52,7 +52,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - get_datapage r3 + get_datapage r3 vdso_u_arch_data #ifndef __powerpc64__ lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) #endif diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index 5333848322ca6..79c9672124447 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -33,9 +33,9 @@ .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif .ifeq \call_time - get_datapage r5 VDSO_DATA_OFFSET + get_datapage r5 vdso_u_time_data .else - get_datapage r4 VDSO_DATA_OFFSET + get_datapage r4 vdso_u_time_data .endif bl CFUNC(DOTSYM(\funct)) PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S index 1a1b0b6d681a9..72a1012b8a205 100644 --- a/arch/powerpc/kernel/vdso/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso/vdso32.lds.S @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef __LITTLE_ENDIAN__ OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle") @@ -16,7 +17,8 @@ OUTPUT_ARCH(powerpc:common) SECTIONS { - PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S index e21b5506cad62..32102a05eaa7e 100644 --- a/arch/powerpc/kernel/vdso/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso/vdso64.lds.S @@ -6,6 +6,7 @@ #include #include #include +#include #ifdef __LITTLE_ENDIAN__ OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle") @@ -16,7 +17,8 @@ OUTPUT_ARCH(powerpc:common64) SECTIONS { - PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso/vgettimeofday.c b/arch/powerpc/kernel/vdso/vgettimeofday.c index 55a287c9a7366..6f5167d81af5f 100644 --- a/arch/powerpc/kernel/vdso/vgettimeofday.c +++ b/arch/powerpc/kernel/vdso/vgettimeofday.c @@ -7,43 +7,43 @@ #ifdef __powerpc64__ int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_gettime_data(vd, clock, ts); } int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_getres_data(vd, clock_id, res); } #else int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_gettime32_data(vd, clock, ts); } int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_gettime_data(vd, clock, ts); } int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_getres_time32_data(vd, clock_id, res); } #endif int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_gettimeofday_data(vd, tv, tz); } -__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd) +__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_time_data *vd) { return __cvdso_time_data(vd, time); } diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 8c464a5d82469..2429cb1c7baa7 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -495,8 +495,7 @@ static void start_watchdog(void *arg) *this_cpu_ptr(&wd_timer_tb) = get_tb(); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms), HRTIMER_MODE_REL_PINNED); } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ce1d91eed231b..61f2b7e007fad 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -766,8 +766,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; - hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + hrtimer_setup(&vcpu->arch.dec_timer, kvmppc_decrementer_wakeup, CLOCK_REALTIME, + HRTIMER_MODE_ABS); #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7612c52e9b1e3..0409be7686663 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN - select ARCH_HAS_VDSO_TIME_DATA + select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -111,11 +111,13 @@ config RISCV select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_LIB_DEVMEM_IS_ALLOWED + select GENERIC_PENDING_IRQ if SMP select GENERIC_PCI_IOMAP select GENERIC_PTDUMP if MMU select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if MMU && 64BIT + select GENERIC_VDSO_DATA_STORE if MMU select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi index e62ac51ac55ab..fef2a0e0f7a3f 100644 --- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi @@ -173,6 +173,16 @@ #clock-cells = <1>; }; + msi: msi-controller@7030010304 { + compatible = "sophgo,sg2042-msi"; + reg = <0x70 0x30010304 0x0 0x4>, + <0x70 0x30010300 0x0 0x4>; + reg-names = "clr", "doorbell"; + msi-controller; + #msi-cells = <0>; + msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>; + }; + rpgate: clock-controller@7030010368 { compatible = "sophgo,sg2042-rpgate"; reg = <0x70 0x30010368 0x0 0x98>; diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 4cadc56220fea..427c41dde6431 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -231,7 +231,7 @@ __arch_cmpxchg(".w", ".w" sc_sfx, ".w" cas_sfx, \ sc_prepend, sc_append, \ cas_prepend, cas_append, \ - __ret, __ptr, (long), __old, __new); \ + __ret, __ptr, (long)(int)(long), __old, __new); \ break; \ case 8: \ __arch_cmpxchg(".d", ".d" sc_sfx, ".d" cas_sfx, \ diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 72be100afa236..90c86b115e008 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -93,7 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %[r]) \ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %[r]) \ : [r] "+r" (ret), [v] "=&r" (val), [u] "+m" (*uaddr), [t] "=&r" (tmp) - : [ov] "Jr" (oldval), [nv] "Jr" (newval) + : [ov] "Jr" ((long)(int)oldval), [nv] "Jr" (newval) : "memory"); __disable_user_access(); diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index faf3624d80577..4461264977684 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -28,7 +28,8 @@ void set_huge_pte_at(struct mm_struct *mm, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); + unsigned long addr, pte_t *ptep, + unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 1c5c641075d2f..0257f4aa7ff45 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -136,7 +136,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) #include #ifdef CONFIG_MMU -#define arch_memremap_wb(addr, size) \ +#define arch_memremap_wb(addr, size, flags) \ ((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL)) #endif diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index f891478829a52..c130d8100232c 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -14,7 +14,7 @@ */ #ifdef CONFIG_MMU -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ #include diff --git a/arch/riscv/include/asm/vdso/time_data.h b/arch/riscv/include/asm/vdso/arch_data.h similarity index 71% rename from arch/riscv/include/asm/vdso/time_data.h rename to arch/riscv/include/asm/vdso/arch_data.h index dfa65228999be..da57a3786f7a5 100644 --- a/arch/riscv/include/asm/vdso/time_data.h +++ b/arch/riscv/include/asm/vdso/arch_data.h @@ -1,12 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __RISCV_ASM_VDSO_TIME_DATA_H -#define __RISCV_ASM_VDSO_TIME_DATA_H +#ifndef __RISCV_ASM_VDSO_ARCH_DATA_H +#define __RISCV_ASM_VDSO_ARCH_DATA_H #include #include #include -struct arch_vdso_time_data { +struct vdso_arch_data { /* Stash static answers to the hwprobe queries when all CPUs are selected. */ __u64 all_cpu_hwprobe_values[RISCV_HWPROBE_MAX_KEY + 1]; @@ -14,4 +14,4 @@ struct arch_vdso_time_data { __u8 homogeneous_cpus; }; -#endif /* __RISCV_ASM_VDSO_TIME_DATA_H */ +#endif /* __RISCV_ASM_VDSO_ARCH_DATA_H */ diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index ba3283cf7acca..29164f84f93ce 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -69,7 +69,7 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) #endif /* CONFIG_GENERIC_TIME_VSYSCALL */ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { /* * The purpose of csr_read(CSR_TIME) is to trap the system into @@ -79,18 +79,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return csr_read(CSR_TIME); } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/riscv/include/asm/vdso/vsyscall.h b/arch/riscv/include/asm/vdso/vsyscall.h index e8a9c4b53c0c9..1140b54b4bc82 100644 --- a/arch/riscv/include/asm/vdso/vsyscall.h +++ b/arch/riscv/include/asm/vdso/vsyscall.h @@ -6,15 +6,6 @@ #include -extern struct vdso_data *vdso_data; - -static __always_inline struct vdso_data *__riscv_get_k_vdso_data(void) -{ - return vdso_data; -} - -#define __arch_get_k_vdso_data __riscv_get_k_vdso_data - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 2d40736fc37ce..26b085dbdd073 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -108,11 +108,11 @@ int populate_cache_leaves(unsigned int cpu) if (!np) return -ENOENT; - if (of_property_read_bool(np, "cache-size")) + if (of_property_present(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) + if (of_property_present(np, "i-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) + if (of_property_present(np, "d-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); prev = np; @@ -125,11 +125,11 @@ int populate_cache_leaves(unsigned int cpu) break; if (level <= levels) break; - if (of_property_read_bool(np, "cache-size")) + if (of_property_present(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) + if (of_property_present(np, "i-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) + if (of_property_present(np, "d-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); levels = level; } diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index c6ba750536c32..40ac72e407b68 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -479,7 +479,7 @@ static void __init riscv_resolve_isa(unsigned long *source_isa, if (bit < RISCV_ISA_EXT_BASE) *this_hwcap |= isa2hwcap[bit]; } - } while (loop && memcmp(prev_resolved_isa, resolved_isa, sizeof(prev_resolved_isa))); + } while (loop && !bitmap_equal(prev_resolved_isa, resolved_isa, RISCV_ISA_EXT_MAX)); } static void __init match_isa_ext(const char *name, const char *name_end, unsigned long *bitmap) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f1793630fc518..4fe45daa6281e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -322,8 +322,8 @@ void __init setup_arch(char **cmdline_p) riscv_init_cbo_blocksizes(); riscv_fill_hwcap(); - init_rt_signal_env(); apply_boot_alternatives(); + init_rt_signal_env(); if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && riscv_isa_extension_available(NULL, ZICBOM)) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 94e905eea1dee..08378fea3a111 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -215,12 +215,6 @@ static size_t get_rt_frame_size(bool cal_all) if (cal_all || riscv_v_vstate_query(task_pt_regs(current))) total_context_size += riscv_v_sc_size; } - /* - * Preserved a __riscv_ctx_hdr for END signal context header if an - * extension uses __riscv_extra_ext_header - */ - if (total_context_size) - total_context_size += sizeof(struct __riscv_ctx_hdr); frame_size += total_context_size; diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index bcd3b816306c2..04a4e54955128 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -450,8 +450,7 @@ static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs, static int __init init_hwprobe_vdso_data(void) { - struct vdso_data *vd = __arch_get_k_vdso_data(); - struct arch_vdso_time_data *avd = &vd->arch_data; + struct vdso_arch_data *avd = vdso_k_arch_data; u64 id_bitsmash = 0; struct riscv_hwprobe pair; int key; diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 3ca3ae4277e18..cc2895d1fbc2f 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -13,20 +13,11 @@ #include #include #include -#include +#include #include #include -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - -#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) - -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; +#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT) struct __vdso_info { const char *name; @@ -79,78 +70,6 @@ static void __init __vdso_init(struct __vdso_info *vdso_info) vdso_info->cm->pages = vdso_pagelist; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static const struct vm_special_mapping rv_vvar_map; - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &rv_vvar_map)) - zap_vma_pages(vma); - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - -static const struct vm_special_mapping rv_vvar_map = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping rv_vdso_map __ro_after_init = { .name = "[vdso]", .mremap = vdso_mremap, @@ -196,7 +115,7 @@ static int __setup_additional_pages(struct mm_struct *mm, unsigned long vdso_base, vdso_text_len, vdso_mapping_len; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info->vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ @@ -208,8 +127,7 @@ static int __setup_additional_pages(struct mm_struct *mm, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE, - (VM_READ | VM_MAYREAD | VM_PFNMAP), &rv_vvar_map); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 9a1b555e87331..ad73607abc280 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -2,7 +2,7 @@ # Copied from arch/tile/kernel/vdso/Makefile # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Symbols present in the vdso vdso-syms = rt_sigreturn ifdef CONFIG_64BIT diff --git a/arch/riscv/kernel/vdso/hwprobe.c b/arch/riscv/kernel/vdso/hwprobe.c index a158c029344f6..2ddeba6c68dda 100644 --- a/arch/riscv/kernel/vdso/hwprobe.c +++ b/arch/riscv/kernel/vdso/hwprobe.c @@ -16,8 +16,7 @@ static int riscv_vdso_get_values(struct riscv_hwprobe *pairs, size_t pair_count, size_t cpusetsize, unsigned long *cpus, unsigned int flags) { - const struct vdso_data *vd = __arch_get_vdso_data(); - const struct arch_vdso_time_data *avd = &vd->arch_data; + const struct vdso_arch_data *avd = &vdso_u_arch_data; bool all_cpus = !cpusetsize && !cpus; struct riscv_hwprobe *p = pairs; struct riscv_hwprobe *end = pairs + pair_count; @@ -51,8 +50,7 @@ static int riscv_vdso_get_cpus(struct riscv_hwprobe *pairs, size_t pair_count, size_t cpusetsize, unsigned long *cpus, unsigned int flags) { - const struct vdso_data *vd = __arch_get_vdso_data(); - const struct arch_vdso_time_data *avd = &vd->arch_data; + const struct vdso_arch_data *avd = &vdso_u_arch_data; struct riscv_hwprobe *p = pairs; struct riscv_hwprobe *end = pairs + pair_count; unsigned char *c = (unsigned char *)cpus; diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index cbe2a179331d2..8e86965a8aae4 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -4,15 +4,14 @@ */ #include #include +#include OUTPUT_ARCH(riscv) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index a8085cd8215e3..29ef9c2133a93 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -974,7 +974,6 @@ int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, if (imsic->vsfile_cpu >= 0) { writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE); - kvm_vcpu_kick(vcpu); } else { eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)]; set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip); diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index dce667f4b6ab0..3070bb31745de 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -79,12 +80,12 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) return SBI_ERR_INVALID_PARAM; - if (!kvm_riscv_vcpu_stopped(target_vcpu)) - return SBI_HSM_STATE_STARTED; - else if (vcpu->stat.generic.blocking) + if (kvm_riscv_vcpu_stopped(target_vcpu)) + return SBI_HSM_STATE_STOPPED; + else if (target_vcpu->stat.generic.blocking) return SBI_HSM_STATE_SUSPENDED; else - return SBI_HSM_STATE_STOPPED; + return SBI_HSM_STATE_STARTED; } static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, @@ -109,7 +110,7 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, } return 0; case SBI_EXT_HSM_HART_SUSPEND: - switch (cp->a0) { + switch (lower_32_bits(cp->a0)) { case SBI_HSM_SUSPEND_RET_DEFAULT: kvm_riscv_vcpu_wfi(vcpu); break; diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 9c2ab3dfa93aa..5fbf3f94f1e85 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -21,7 +21,7 @@ static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, u64 next_cycle; if (cp->a6 != SBI_EXT_TIME_SET_TIMER) { - retdata->err_val = SBI_ERR_INVALID_PARAM; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; return 0; } @@ -51,9 +51,10 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; + unsigned long hart_bit = 0, sentmask = 0; if (cp->a6 != SBI_EXT_IPI_SEND_IPI) { - retdata->err_val = SBI_ERR_INVALID_PARAM; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; return 0; } @@ -62,15 +63,23 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (hbase != -1UL) { if (tmp->vcpu_id < hbase) continue; - if (!(hmask & (1UL << (tmp->vcpu_id - hbase)))) + hart_bit = tmp->vcpu_id - hbase; + if (hart_bit >= __riscv_xlen) + goto done; + if (!(hmask & (1UL << hart_bit))) continue; } ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT); if (ret < 0) break; + sentmask |= 1UL << hart_bit; kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD); } +done: + if (hbase != -1UL && (hmask ^ sentmask)) + retdata->err_val = SBI_ERR_INVALID_PARAM; + return ret; } diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c index 5d55e08791fa1..bc0ebba890037 100644 --- a/arch/riscv/kvm/vcpu_sbi_system.c +++ b/arch/riscv/kvm/vcpu_sbi_system.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -19,7 +20,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, switch (funcid) { case SBI_EXT_SUSP_SYSTEM_SUSPEND: - if (cp->a0 != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { + if (lower_32_bits(cp->a0) != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { retdata->err_val = SBI_ERR_INVALID_PARAM; return 0; } diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 96e7a4e463f7f..ff672fa71fccf 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -248,18 +248,19 @@ int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu) if (t->init_done) return -EINVAL; - hrtimer_init(&t->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); t->init_done = true; t->next_set = false; /* Enable sstc for every vcpu if available in hardware */ if (riscv_isa_extension_available(NULL, SSTC)) { t->sstc_enabled = true; - t->hrt.function = kvm_riscv_vcpu_vstimer_expired; + hrtimer_setup(&t->hrt, kvm_riscv_vcpu_vstimer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); t->timer_next_event = kvm_riscv_vcpu_update_vstimecmp; } else { t->sstc_enabled = false; - t->hrt.function = kvm_riscv_vcpu_hrtimer_expired; + hrtimer_setup(&t->hrt, kvm_riscv_vcpu_hrtimer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); t->timer_next_event = kvm_riscv_vcpu_update_hrtimer; } diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 42314f0939220..b4a78a4b35cff 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -293,7 +293,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t orig_pte = ptep_get(ptep); int pte_num; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9c9ec08d78c71..7ed6f229250cf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -166,6 +166,7 @@ config S390 select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 885bd1dd2c82f..9276e0576d0ab 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -86,7 +86,7 @@ static int cmma_test_essa(void) : [reg1] "=&d" (reg1), [reg2] "=&a" (reg2), [rc] "+&d" (rc), - [tmp] "=&d" (tmp), + [tmp] "+&d" (tmp), "+Q" (get_lowcore()->program_new_psw), "=Q" (old) : [psw_old] "a" (&old), diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 44f01a4bc810f..80bdfbae6e5b4 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -469,6 +469,7 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -874,6 +875,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8bcd37edd3c97..449a0e996b963 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -459,6 +459,7 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -825,6 +826,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7c52acaf9f828..663e87220e89f 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -25,8 +25,16 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned long sz) +{ + return __huge_ptep_get_and_clear(mm, addr, ptep); +} static inline void arch_clear_hugetlb_flags(struct folio *folio) { @@ -48,7 +56,7 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return __huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -59,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; @@ -69,7 +77,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __huge_ptep_get_and_clear(mm, addr, ptep); __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 92c73e4d97a93..420a073fdde5d 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -6,13 +6,11 @@ #ifndef __ASSEMBLY__ -extern struct vdso_data *vdso_data; - int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #define VDSO_VERSION_STRING LINUX_2.6.29 diff --git a/arch/s390/include/asm/vdso/getrandom.h b/arch/s390/include/asm/vdso/getrandom.h index 36355af7160be..f8713ce39bb2f 100644 --- a/arch/s390/include/asm/vdso/getrandom.h +++ b/arch/s390/include/asm/vdso/getrandom.h @@ -23,18 +23,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig return syscall3(__NR_getrandom, (long)buffer, (long)len, (long)flags); } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - /* - * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace - * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_ - * PAGE_OFFSET points to the real VVAR page. - */ - if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - return &_vdso_rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index 7937765ccfa5b..fb4564308e9d7 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -14,12 +14,7 @@ #include -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd) { u64 adj, now; @@ -49,12 +44,4 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) return syscall2(__NR_clock_getres, (long)clkid, (long)ts); } -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif - #endif diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h index 3eb576ecd3bd9..d346ebe513015 100644 --- a/arch/s390/include/asm/vdso/vsyscall.h +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -2,32 +2,12 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 768 - #ifndef __ASSEMBLY__ #include #include #include -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES -}; - -static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __s390_get_k_vdso_data - -static __always_inline struct vdso_rng_data *__s390_get_k_vdso_rnd_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __s390_get_k_vdso_rnd_data - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 63ba6306632ef..e540b022ceb23 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -266,12 +266,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; + unsigned long sp = arch_ftrace_regs(fregs)->regs.gprs[15]; if (unlikely(ftrace_graph_is_dead())) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + if (!function_graph_enter_regs(*parent, ip, 0, (unsigned long *)sp, fregs)) *parent = (unsigned long)&return_to_handler; } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index e9f47c3a61978..699a18f1c54eb 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -79,12 +79,10 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; - int cs; /* Initialize TOD steering parameters */ tod_steering_end = tod_clock_base.tod; - for (cs = 0; cs < CS_BASES; cs++) - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -373,7 +371,6 @@ static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; - int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; @@ -389,10 +386,8 @@ static void clock_sync_global(long delta) panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - for (cs = 0; cs < CS_BASES; cs++) { - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; - vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; - } + vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_steering_delta = tod_steering_delta; /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 24fee11b030d8..b746213d3110c 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -285,10 +285,10 @@ static void __init test_monitor_call(void) return; asm volatile( " mc 0,0\n" - "0: xgr %0,%0\n" + "0: lhi %[val],0\n" "1:\n" - EX_TABLE(0b,1b) - : "+d" (val)); + EX_TABLE(0b, 1b) + : [val] "+d" (val)); if (!val) panic("Monitor call doesn't work!\n"); } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 598b512cde012..70c8f9ad13cde 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -26,85 +26,6 @@ extern char vdso64_start[], vdso64_end[]; extern char vdso32_start[], vdso32_end[]; -static struct vm_special_mapping vvar_mapping; - -static union vdso_data_store vdso_data_store __page_aligned_data; - -struct vdso_data *vdso_data = vdso_data_store.data; - -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The VVAR page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will be re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - VMA_ITERATOR(vmi, mm, 0); - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (!vma_is_special_mapping(vma, &vvar_mapping)) - continue; - zap_vma_pages(vma); - break; - } - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long addr, pfn; - vm_fault_t err; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - if (timens_page) { - /* - * Fault in VVAR page too, since it will be accessed - * to get clock data anyway. - */ - addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - pfn = page_to_pfn(timens_page); - } - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - return vmf_insert_pfn(vma, vmf->address, pfn); -} - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) { @@ -112,11 +33,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping vdso64_mapping = { .name = "[vdso]", .mremap = vdso_mremap, @@ -142,7 +58,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) struct vm_area_struct *vma; int rc; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -157,14 +73,11 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; - vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); + vma = vdso_install_vvar_mapping(mm, vvar_start); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| @@ -220,7 +133,7 @@ unsigned long vdso_text_size(void) unsigned long vdso_size(void) { - return vdso_text_size() + VVAR_NR_PAGES * PAGE_SIZE; + return vdso_text_size() + VDSO_NR_PAGES * PAGE_SIZE; } int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index 2c5afb88d2982..1e4ddd1a683ff 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -2,7 +2,7 @@ # List of files in the vdso # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso32 = vdso_user_wrapper-32.o note-32.o # Build rules diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index c916c4f73f766..9630d58c20806 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -6,16 +6,15 @@ #include #include +#include OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") OUTPUT_ARCH(s390:31-bit) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index ad206f2068d8c..d8f0df7428096 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -2,7 +2,7 @@ # List of files in the vdso # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index ec42b7d9cb530..e4f6551ae8988 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -7,17 +7,15 @@ #include #include #include +#include OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 07ff0e10cb7f5..0f00f8e85feec 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3174,8 +3174,7 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); gi->expires = 50 * 1000; /* 50 usec */ - hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - gi->timer.function = gisa_vcpu_kicker; + hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL); memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)virt_to_phys(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ebecb96bacce7..1066c6ac59014 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3943,8 +3943,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) if (rc) return rc; } - hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + hrtimer_setup(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); vcpu->arch.sie_block->hpid = HPID_KVM; diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d9ce199953de9..2e568f175cd41 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -188,8 +188,8 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bdcf2a3b6c41b..bd39b36e7bd68 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -8,7 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY +CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) @@ -19,9 +19,11 @@ KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS += -D__DISABLE_EXPORTS # Since we link purgatory with -r unresolved symbols are not checked, so we # also link a purgatory.chk binary without -r to check for unresolved symbols. diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index c714ca6a05aa0..e7a9cdd498dca 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,7 +20,7 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index eee601a0d2cfb..80504148d8a5b 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -260,7 +260,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { unsigned int i, nptes, orig_shift, shift; unsigned long size; diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 50ec2978cda53..fdc4a8f5a49c5 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -22,7 +22,7 @@ targets += $(foreach x, 32 64, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf64_sparc -soname linux-vdso.so.1 --no-undefined \ +VDSO_LDFLAGS_vdso.lds = -m elf64_sparc -soname linux-vdso.so.1 \ -z max-page-size=8192 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE @@ -101,7 +101,6 @@ $(obj)/vdso32.so.dbg: FORCE \ quiet_cmd_vdso = VDSO $@ cmd_vdso = $(LD) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -T $(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(src)/checkundef.sh '$(OBJDUMP)' '$@' + -T $(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic --no-undefined diff --git a/arch/sparc/vdso/checkundef.sh b/arch/sparc/vdso/checkundef.sh deleted file mode 100644 index 2d85876ffc325..0000000000000 --- a/arch/sparc/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -objdump="$1" -file="$2" -$objdump -t "$file" | grep '*UUND*' | grep -v '#scratch' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797ec..8be91974e786d 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -440,25 +440,24 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void apply_seal_endbr(s32 *start, s32 *end) { } -void apply_retpolines(s32 *start, s32 *end, struct module *mod) +void apply_retpolines(s32 *start, s32 *end) { } -void apply_returns(s32 *start, s32 *end, struct module *mod) +void apply_returns(s32 *start, s32 *end) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be2c311f5118d..1665ebaba2514 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_EXECMEM_ROX if X86_64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -132,7 +133,7 @@ config X86 select ARCH_SUPPORTS_AUTOFDO_CLANG select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 + select ARCH_USE_CMPXCHG_LOCKREF if X86_CX8 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -178,6 +179,7 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE @@ -232,7 +234,7 @@ config X86 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_EISA + select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE @@ -277,7 +279,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API @@ -285,7 +287,7 @@ config X86 select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK - select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR + select HAVE_STACKPROTECTOR select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL @@ -426,15 +428,6 @@ config PGTABLE_LEVELS default 3 if X86_PAE default 2 -config CC_HAS_SANE_STACKPROTECTOR - bool - default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) if 64BIT - default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) - help - We have to make sure stack protector is unconditionally disabled if - the compiler produces broken code or if it does not let us control - the segment on 32-bit kernels. - menu "Processor type and features" config SMP @@ -530,12 +523,6 @@ config X86_FRED ring transitions and exception/interrupt handling if the system supports it. -config X86_BIGSMP - bool "Support for big SMP systems with more than 8 CPUs" - depends on SMP && X86_32 - help - This option is needed for the systems that have more than 8 CPUs. - config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -553,13 +540,12 @@ config X86_EXTENDED_PLATFORM AMD Elan RDC R-321x SoC SGI 320/540 (Visual Workstation) - STA2X11-based (e.g. Northville) - Moorestown MID devices 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip ScaleMP vSMP SGI Ultraviolet + Merrifield/Moorefield MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -604,8 +590,31 @@ config X86_UV This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. -# Following is an alphabetically sorted list of 32 bit extended platforms -# Please maintain the alphabetic order if and when there are additions +config X86_INTEL_MID + bool "Intel Z34xx/Z35xx MID platform support" + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on PCI + depends on X86_64 || (EXPERT && PCI_GOANY) + depends on X86_IO_APIC + select I2C + select DW_APB_TIMER + select INTEL_SCU_PCI + help + Select to build a kernel capable of supporting 64-bit Intel MID + (Mobile Internet Device) platform systems which do not have + the PCI legacy interfaces. + + The only supported devices are the 22nm Merrified (Z34xx) + and Moorefield (Z35xx) SoC used in the Intel Edison board and + a small number of Android devices such as the Asus Zenfone 2, + Asus FonePad 8 and Dell Venue 7. + + If you are building for a PC class system or non-MID tablet + SoCs like Bay Trail (Z36xx/Z37xx), say N here. + + Intel MID platforms are based on an Intel processor and chipset which + consume less power than most of the x86 derivatives. config X86_GOLDFISH bool "Goldfish (Virtual Platform)" @@ -615,6 +624,9 @@ config X86_GOLDFISH for Android development. Unless you are building for the Android Goldfish emulator say N here. +# Following is an alphabetically sorted list of 32 bit extended platforms +# Please maintain the alphabetic order if and when there are additions + config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -630,24 +642,6 @@ config X86_INTEL_CE This option compiles in support for the CE4100 SOC for settop boxes and media devices. -config X86_INTEL_MID - bool "Intel MID platform support" - depends on X86_EXTENDED_PLATFORM - depends on X86_PLATFORM_DEVICES - depends on PCI - depends on X86_64 || (PCI_GOANY && X86_32) - depends on X86_IO_APIC - select I2C - select DW_APB_TIMER - select INTEL_SCU_PCI - help - Select to build a kernel capable of supporting Intel MID (Mobile - Internet Device) platform systems which do not have the PCI legacy - interfaces. If you are building for a PC class system say N here. - - Intel MID platforms are based on an Intel processor and chipset which - consume less power than most of the x86 derivatives. - config X86_INTEL_QUARK bool "Intel Quark platform support" depends on X86_32 @@ -729,18 +723,6 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config X86_32_NON_STANDARD - bool "Support non-standard 32-bit SMP architectures" - depends on X86_32 && SMP - depends on X86_EXTENDED_PLATFORM - help - This option compiles in the bigsmp and STA2X11 default - subarchitectures. It is intended for a generic binary - kernel. If you select them all, kernel will probe it one by - one and will fallback to default. - -# Alphabetically sorted list of Non standard 32 bit platforms - config X86_SUPPORTS_MEMORY_FAILURE def_bool y # MCE code calls memory_failure(): @@ -750,19 +732,6 @@ config X86_SUPPORTS_MEMORY_FAILURE depends on X86_64 || !SPARSEMEM select ARCH_SUPPORTS_MEMORY_FAILURE -config STA2X11 - bool "STA2X11 Companion Chip Support" - depends on X86_32_NON_STANDARD && PCI - select SWIOTLB - select MFD_STA2X11 - select GPIOLIB - help - This adds support for boards based on the STA2X11 IO-Hub, - a.k.a. "ConneXt". The chip is used in place of the standard - PC chipset, so all "standard" peripherals are missing. If this - option is selected the kernel will still be able to boot on - standard PC machines. - config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" depends on X86_32 @@ -1012,8 +981,7 @@ config NR_CPUS_RANGE_BEGIN config NR_CPUS_RANGE_END int depends on X86_32 - default 64 if SMP && X86_BIGSMP - default 8 if SMP && !X86_BIGSMP + default 8 if SMP default 1 if !SMP config NR_CPUS_RANGE_END @@ -1026,7 +994,6 @@ config NR_CPUS_RANGE_END config NR_CPUS_DEFAULT int depends on X86_32 - default 32 if X86_BIGSMP default 8 if SMP default 1 if !SMP @@ -1102,7 +1069,7 @@ config UP_LATE_INIT config X86_UP_APIC bool "Local APIC support on uniprocessors" if !PCI_MSI default PCI_MSI - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -1127,7 +1094,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI + depends on X86_64 || SMP || X86_UP_APIC || PCI_MSI select IRQ_DOMAIN_HIERARCHY config ACPI_MADT_WAKEUP @@ -1341,6 +1308,7 @@ config X86_REBOOTFIXUPS config MICROCODE def_bool y depends on CPU_SUP_AMD || CPU_SUP_INTEL + select CRYPTO_LIB_SHA256 if CPU_SUP_AMD config MICROCODE_INITRD32 def_bool y @@ -1395,15 +1363,11 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -choice - prompt "High Memory Support" - default HIGHMEM4G +config HIGHMEM4G + bool "High Memory Support" depends on X86_32 - -config NOHIGHMEM - bool "off" help - Linux can use up to 64 Gigabytes of physical memory on x86 systems. + Linux can use up to 4 Gigabytes of physical memory on x86 systems. However, the address space of 32-bit x86 processors is only 4 Gigabytes large. That means that, if you have a large amount of physical memory, not all of it can be "permanently mapped" by the @@ -1419,38 +1383,9 @@ config NOHIGHMEM possible. If the machine has between 1 and 4 Gigabytes physical RAM, then - answer "4GB" here. - - If more than 4 Gigabytes is used then answer "64GB" here. This - selection turns Intel PAE (Physical Address Extension) mode on. - PAE implements 3-level paging on IA32 processors. PAE is fully - supported by Linux, PAE mode is implemented on all recent Intel - processors (Pentium Pro and better). NOTE: If you say "64GB" here, - then the kernel will not boot on CPUs that don't support PAE! - - The actual amount of total physical memory will either be - auto detected or can be forced by using a kernel command line option - such as "mem=256M". (Try "man bootparam" or see the documentation of - your boot loader (lilo or loadlin) about how to pass options to the - kernel at boot time.) - - If unsure, say "off". - -config HIGHMEM4G - bool "4GB" - help - Select this if you have a 32-bit processor and between 1 and 4 - gigabytes of physical RAM. - -config HIGHMEM64G - bool "64GB" - depends on X86_HAVE_PAE - select X86_PAE - help - Select this if you have a 32-bit processor and more than 4 - gigabytes of physical RAM. + answer "Y" here. -endchoice + If unsure, say N. choice prompt "Memory split" if EXPERT @@ -1496,14 +1431,12 @@ config PAGE_OFFSET depends on X86_32 config HIGHMEM - def_bool y - depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) + def_bool HIGHMEM4G config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && X86_HAVE_PAE select PHYS_ADDR_T_64BIT - select SWIOTLB help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It @@ -1573,8 +1506,7 @@ config AMD_MEM_ENCRYPT config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) - default y if X86_BIGSMP + depends on X86_64 select USE_PERCPU_NUMA_NODE_ID select OF_NUMA if OF help @@ -1587,9 +1519,6 @@ config NUMA For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. - For 32-bit this is only needed if you boot a 32-bit - kernel on a 64-bit NUMA platform. - Otherwise, you should say N. config AMD_NUMA @@ -1628,7 +1557,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD + depends on X86_64 || NUMA || X86_32 select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -1674,15 +1603,6 @@ config X86_PMEM_LEGACY Say Y if unsure. -config HIGHPTE - bool "Allocate 3rd-level pagetables from highmem" - depends on HIGHMEM - help - The VM uses one page table entry for each page of physical memory. - For systems with a lot of RAM, this can be wasteful of precious - low memory. Setting this option will put user-space page table - entries in high memory. - config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" help @@ -2450,18 +2370,20 @@ config CC_HAS_NAMED_AS def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null) depends on CC_IS_GCC +# +# -fsanitize=kernel-address (KASAN) and -fsanitize=thread (KCSAN) +# are incompatible with named address spaces with GCC < 13.3 +# (see GCC PR sanitizer/111736 and also PR sanitizer/115172). +# + config CC_HAS_NAMED_AS_FIXED_SANITIZERS - def_bool CC_IS_GCC && GCC_VERSION >= 130300 + def_bool y + depends on !(KASAN || KCSAN) || GCC_VERSION >= 130300 + depends on !(UBSAN_BOOL && KASAN) || GCC_VERSION >= 140200 config USE_X86_SEG_SUPPORT - def_bool y - depends on CC_HAS_NAMED_AS - # - # -fsanitize=kernel-address (KASAN) and -fsanitize=thread - # (KCSAN) are incompatible with named address spaces with - # GCC < 13.3 - see GCC PR sanitizer/111736. - # - depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS + def_bool CC_HAS_NAMED_AS + depends on CC_HAS_NAMED_AS_FIXED_SANITIZERS config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) @@ -2472,6 +2394,10 @@ config CC_HAS_RETURN_THUNK config CC_HAS_ENTRY_PADDING def_bool $(cc-option,-fpatchable-function-entry=16,16) +config CC_HAS_KCFI_ARITY + def_bool $(cc-option,-fsanitize=kcfi -fsanitize-kcfi-arity) + depends on CC_IS_CLANG && !RUST + config FUNCTION_PADDING_CFI int default 59 if FUNCTION_ALIGNMENT_64B @@ -2497,6 +2423,10 @@ config FINEIBT depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE select CALL_PADDING +config FINEIBT_BHI + def_bool y + depends on FINEIBT && CC_HAS_KCFI_ARITY + config HAVE_CALL_THUNKS def_bool y depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL @@ -3201,4 +3131,6 @@ config HAVE_ATOMIC_IOMAP source "arch/x86/kvm/Kconfig" +source "arch/x86/Kconfig.cpufeatures" + source "arch/x86/Kconfig.assembler" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2a7279d80460a..753b8763abaeb 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # Put here option for CPU selection and depending optimization choice - prompt "Processor family" - default M686 if X86_32 - default GENERIC_CPU if X86_64 + prompt "x86-32 Processor family" + depends on X86_32 + default M686 help This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -31,7 +31,6 @@ choice - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). - - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. @@ -42,13 +41,10 @@ choice - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. - - "Intel P4" for the Pentium 4/Netburst microarchitecture. - - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. - "Intel Atom" for the Atom-microarchitecture CPUs. - - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. See each option's help text for additional details. If you don't know - what to do, choose "486". + what to do, choose "Pentium-Pro". config M486SX bool "486SX" @@ -114,11 +110,11 @@ config MPENTIUMIII extensions. config MPENTIUMM - bool "Pentium M" + bool "Pentium M/Pentium Dual Core/Core Solo/Core Duo" depends on X86_32 help Select this for Intel Pentium M (not Pentium-4 M) - notebook chips. + "Merom" Core Solo/Duo notebook chips config MPENTIUM4 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" @@ -139,22 +135,10 @@ config MPENTIUM4 -Mobile Pentium 4 -Mobile Pentium 4 M -Extreme Edition (Gallatin) - -Prescott - -Prescott 2M - -Cedar Mill - -Presler - -Smithfiled Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename: -Foster -Prestonia -Gallatin - -Nocona - -Irwindale - -Cranford - -Potomac - -Paxville - -Dempsey - config MK6 bool "K6/K6-II/K6-III" @@ -172,13 +156,6 @@ config MK7 some extended instructions, and passes appropriate optimization flags to GCC. -config MK8 - bool "Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. - Enables use of some extended instructions, and passes appropriate - optimization flags to GCC. - config MCRUSOE bool "Crusoe" depends on X86_32 @@ -258,42 +235,14 @@ config MVIAC7 Select this for a VIA C7. Selecting this uses the correct cache shift and tells gcc to treat the CPU as a 686. -config MPSC - bool "Intel P4 / older Netburst based Xeon" - depends on X86_64 - help - Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey - Xeon CPUs with Intel 64bit which is compatible with x86-64. - Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distinguish them - using the cpu family field - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. - -config MCORE2 - bool "Core 2/newer Xeon" - help - - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and - 53xx) CPUs. You can distinguish newer from older Xeons by the CPU - family in /proc/cpuinfo. Newer ones have 6 and older ones 15 - (not a typo) - config MATOM bool "Intel Atom" help - Select this for the Intel Atom platform. Intel Atom CPUs have an in-order pipelining architecture and thus can benefit from accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. -config GENERIC_CPU - bool "Generic-x86-64" - depends on X86_64 - help - Generic x86-64 CPU. - Run equally well on all x86-64 CPUs. - endchoice config X86_GENERIC @@ -317,8 +266,8 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int - default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "7" if MPENTIUM4 + default "6" if MK7 || MPENTIUMM || MATOM || MVIAC7 || X86_GENERIC || X86_64 default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -336,51 +285,35 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK7 || MEFFICEON config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM - -# -# P6_NOPs are a relatively minor optimization that require a family >= -# 6 processor, except that it is broken on certain VIA chips. -# Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). In addition, it looks like Virtual PC -# does not understand them. -# -# As a result, disallow these if we're not compiling for X86_64 (these -# NOPs do work on all x86-64 capable chips); the list of processors in -# the right-hand clause are the cores that benefit from this optimization. -# -config X86_P6_NOP - def_bool y - depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MATOM config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MATOM) || X86_64 config X86_HAVE_PAE def_bool y - depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64 + depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64 -config X86_CMPXCHG64 +config X86_CX8 def_bool y - depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 + depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX # this should be set for all -march=.. options where the compiler # generates cmov. config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || MATOM || MGEODE_LX || X86_64) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) - default "5" if X86_32 && X86_CMPXCHG64 + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7) + default "5" if X86_32 && X86_CX8 default "4" config X86_DEBUGCTLMSR @@ -401,6 +334,10 @@ menuconfig PROCESSOR_SELECT This lets you choose what x86 vendor support code your kernel will include. +config BROADCAST_TLB_FLUSH + def_bool y + depends on CPU_SUP_AMD && 64BIT + config CPU_SUP_INTEL default y bool "Support Intel processors" if PROCESSOR_SELECT diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures new file mode 100644 index 0000000000000..e12d5b7e39a25 --- /dev/null +++ b/arch/x86/Kconfig.cpufeatures @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# x86 feature bits (see arch/x86/include/asm/cpufeatures.h) that are +# either REQUIRED to be enabled, or DISABLED (always ignored) for this +# particular compile-time configuration. The tests for these features +# are turned into compile-time constants via the generated +# . +# +# The naming of these variables *must* match asm/cpufeatures.h, e.g., +# X86_FEATURE_ALWAYS <==> X86_REQUIRED_FEATURE_ALWAYS +# X86_FEATURE_FRED <==> X86_DISABLED_FEATURE_FRED +# +# And these REQUIRED and DISABLED config options are manipulated in an +# AWK script as the following example: +# +# +----------------------+ +# | X86_FRED = y ? | +# +----------------------+ +# / \ +# Y / \ N +# +-------------------------------------+ +-------------------------------+ +# | X86_DISABLED_FEATURE_FRED undefined | | X86_DISABLED_FEATURE_FRED = y | +# +-------------------------------------+ +-------------------------------+ +# | +# | +# +-------------------------------------------+ | +# | X86_FEATURE_FRED: feature word 12, bit 17 | ---->| +# +-------------------------------------------+ | +# | +# | +# +-------------------------------+ +# | set bit 17 of DISABLED_MASK12 | +# +-------------------------------+ +# + +config X86_REQUIRED_FEATURE_ALWAYS + def_bool y + +config X86_REQUIRED_FEATURE_NOPL + def_bool y + depends on X86_64 || X86_P6_NOP + +config X86_REQUIRED_FEATURE_CX8 + def_bool y + depends on X86_CX8 + +# this should be set for all -march=.. options where the compiler +# generates cmov. +config X86_REQUIRED_FEATURE_CMOV + def_bool y + depends on X86_CMOV + +# this should be set for all -march= options where the compiler +# generates movbe. +config X86_REQUIRED_FEATURE_MOVBE + def_bool y + depends on MATOM + +config X86_REQUIRED_FEATURE_CPUID + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_UP + def_bool y + depends on !SMP + +config X86_REQUIRED_FEATURE_FPU + def_bool y + depends on !MATH_EMULATION + +config X86_REQUIRED_FEATURE_PAE + def_bool y + depends on X86_64 || X86_PAE + +config X86_REQUIRED_FEATURE_PSE + def_bool y + depends on X86_64 && !PARAVIRT_XXL + +config X86_REQUIRED_FEATURE_PGE + def_bool y + depends on X86_64 && !PARAVIRT_XXL + +config X86_REQUIRED_FEATURE_MSR + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_FXSR + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_XMM + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_XMM2 + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_LM + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_UMIP + def_bool y + depends on !X86_UMIP + +config X86_DISABLED_FEATURE_VME + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_K6_MTRR + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_CYRIX_ARR + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_CENTAUR_MCR + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_PCID + def_bool y + depends on !X86_64 + +config X86_DISABLED_FEATURE_PKU + def_bool y + depends on !X86_INTEL_MEMORY_PROTECTION_KEYS + +config X86_DISABLED_FEATURE_OSPKE + def_bool y + depends on !X86_INTEL_MEMORY_PROTECTION_KEYS + +config X86_DISABLED_FEATURE_LA57 + def_bool y + depends on !X86_5LEVEL + +config X86_DISABLED_FEATURE_PTI + def_bool y + depends on !MITIGATION_PAGE_TABLE_ISOLATION + +config X86_DISABLED_FEATURE_RETPOLINE + def_bool y + depends on !MITIGATION_RETPOLINE + +config X86_DISABLED_FEATURE_RETPOLINE_LFENCE + def_bool y + depends on !MITIGATION_RETPOLINE + +config X86_DISABLED_FEATURE_RETHUNK + def_bool y + depends on !MITIGATION_RETHUNK + +config X86_DISABLED_FEATURE_UNRET + def_bool y + depends on !MITIGATION_UNRET_ENTRY + +config X86_DISABLED_FEATURE_CALL_DEPTH + def_bool y + depends on !MITIGATION_CALL_DEPTH_TRACKING + +config X86_DISABLED_FEATURE_LAM + def_bool y + depends on !ADDRESS_MASKING + +config X86_DISABLED_FEATURE_ENQCMD + def_bool y + depends on !INTEL_IOMMU_SVM + +config X86_DISABLED_FEATURE_SGX + def_bool y + depends on !X86_SGX + +config X86_DISABLED_FEATURE_XENPV + def_bool y + depends on !XEN_PV + +config X86_DISABLED_FEATURE_TDX_GUEST + def_bool y + depends on !INTEL_TDX_GUEST + +config X86_DISABLED_FEATURE_USER_SHSTK + def_bool y + depends on !X86_USER_SHADOW_STACK + +config X86_DISABLED_FEATURE_IBT + def_bool y + depends on !X86_KERNEL_IBT + +config X86_DISABLED_FEATURE_FRED + def_bool y + depends on !X86_FRED + +config X86_DISABLED_FEATURE_SEV_SNP + def_bool y + depends on !KVM_AMD_SEV + +config X86_DISABLED_FEATURE_INVLPGB + def_bool y + depends on !BROADCAST_TLB_FLUSH diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5b773b34768d1..a607769a25afd 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -140,14 +140,7 @@ ifeq ($(CONFIG_X86_32),y) # temporary until string.h is fixed KBUILD_CFLAGS += -ffreestanding - ifeq ($(CONFIG_STACKPROTECTOR),y) - ifeq ($(CONFIG_SMP),y) - KBUILD_CFLAGS += -mstack-protector-guard-reg=fs \ - -mstack-protector-guard-symbol=__ref_stack_chk_guard - else - KBUILD_CFLAGS += -mstack-protector-guard=global - endif - endif + percpu_seg := fs else BITS := 64 UTS_MACHINE := x86_64 @@ -178,25 +171,24 @@ else # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) - cflags-$(CONFIG_MK8) += -march=k8 - cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic - KBUILD_CFLAGS += $(cflags-y) - - rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 - rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona - rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 - rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom - rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic - KBUILD_RUSTFLAGS += $(rustflags-y) + KBUILD_CFLAGS += -march=x86-64 -mtune=generic + KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel KBUILD_RUSTFLAGS += -Cno-redzone=y KBUILD_RUSTFLAGS += -Ccode-model=kernel + + percpu_seg := gs +endif + +ifeq ($(CONFIG_STACKPROTECTOR),y) + ifeq ($(CONFIG_SMP),y) + KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg) + KBUILD_CFLAGS += -mstack-protector-guard-symbol=__ref_stack_chk_guard + else + KBUILD_CFLAGS += -mstack-protector-guard=global + endif endif # @@ -276,6 +268,21 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all +### +# header generation + +cpufeaturemasks.hdr := arch/x86/include/generated/asm/cpufeaturemasks.h +cpufeaturemasks.awk := $(srctree)/arch/x86/tools/cpufeaturemasks.awk +cpufeatures_hdr := $(srctree)/arch/x86/include/asm/cpufeatures.h +targets += $(cpufeaturemasks.hdr) +quiet_cmd_gen_featuremasks = GEN $@ + cmd_gen_featuremasks = $(AWK) -f $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) > $@ + +$(cpufeaturemasks.hdr): $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) FORCE + $(shell mkdir -p $(dir $@)) + $(call if_changed,gen_featuremasks) +archprepare: $(cpufeaturemasks.hdr) + ### # Kernel objects diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 94834c4b5e5ef..af7de9a427523 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -24,7 +24,6 @@ cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. cflags-$(CONFIG_MK7) += -march=athlon -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -32,9 +31,7 @@ cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 -cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MATOM) += -march=atom # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9cc0ff6e9067d..8589471b65a16 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -35,7 +35,6 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs := tools/build hostprogs += mkcpustr HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ @@ -61,11 +60,9 @@ KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -silent_redirect_image = >/dev/null -cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(obj)/zoffset.h $@ $($(quiet)redirect_image) + cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@ -$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE +$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE $(call if_changed,image) @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')' diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0d37420cad025..1cdcd4aaf3955 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -235,7 +235,7 @@ static void handle_relocations(void *output, unsigned long output_len, /* * Process relocations: 32 bit relocations first then 64 bit after. - * Three sets of binary relocations are added to the end of the kernel + * Two sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel * address of the location which needs to be updated stored as a * 32-bit value which is sign extended to 64 bits. @@ -245,8 +245,6 @@ static void handle_relocations(void *output, unsigned long output_len, * kernel bits... * 0 - zero terminator for 64 bit relocations * 64 bit relocation repeated - * 0 - zero terminator for inverse 32 bit relocations - * 32 bit inverse relocation repeated * 0 - zero terminator for 32 bit relocations * 32 bit relocation repeated * @@ -263,16 +261,6 @@ static void handle_relocations(void *output, unsigned long output_len, *(uint32_t *)ptr += delta; } #ifdef CONFIG_X86_64 - while (*--reloc) { - long extended = *reloc; - extended += map; - - ptr = (unsigned long)extended; - if (ptr < min_addr || ptr > max_addr) - error("inverse 32-bit relocation outside of kernel!\n"); - - *(int32_t *)ptr -= delta; - } for (reloc--; *reloc; reloc--) { long extended = *reloc; extended += map; diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index c882e1f67af01..d8c5de40669d3 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" #include +#include #include #include #include "pgtable.h" @@ -107,6 +108,7 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ + sanitize_boot_params(bp); boot_params_ptr = bp; /* diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 083ec6d7722a1..48d0b51845571 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -48,8 +48,7 @@ SECTIONS *(.data) *(.data.*) - /* Add 4 bytes of extra space for a CRC-32 checksum */ - . = ALIGN(. + 4, 0x200); + . = ALIGN(0x200); _edata = . ; } . = ALIGN(L1_CACHE_BYTES); diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 0aae4d4ed6150..f82de8de5dc62 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -22,10 +22,11 @@ # include "boot.h" #endif #include +#include #include #include -#include #include + #include "string.h" #include "msr.h" diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index d75237ba7ce94..916bac09b464d 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -3,7 +3,6 @@ #include "bitops.h" #include -#include #include #include "cpuflags.h" @@ -29,40 +28,32 @@ static int has_fpu(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } +#ifdef CONFIG_X86_32 /* * For building the 16-bit code we want to explicitly specify 32-bit * push/pop operations, rather than just saying 'pushf' or 'popf' and - * letting the compiler choose. But this is also included from the - * compressed/ directory where it may be 64-bit code, and thus needs - * to be 'pushfq' or 'popfq' in that case. + * letting the compiler choose. */ -#ifdef __x86_64__ -#define PUSHF "pushfq" -#define POPF "popfq" -#else -#define PUSHF "pushfl" -#define POPF "popfl" -#endif - -int has_eflag(unsigned long mask) +bool has_eflag(unsigned long mask) { unsigned long f0, f1; - asm volatile(PUSHF " \n\t" - PUSHF " \n\t" + asm volatile("pushfl \n\t" + "pushfl \n\t" "pop %0 \n\t" "mov %0,%1 \n\t" "xor %2,%1 \n\t" "push %1 \n\t" - POPF " \n\t" - PUSHF " \n\t" + "popfl \n\t" + "pushfl \n\t" "pop %1 \n\t" - POPF + "popfl" : "=&r" (f0), "=&r" (f1) : "ri" (mask)); return !!((f0^f1) & mask); } +#endif void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 475b8fde90f7d..a398d9204ad00 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -15,8 +15,13 @@ struct cpu_features { extern struct cpu_features cpu; extern u32 cpu_vendor[3]; -int has_eflag(unsigned long mask); +#ifdef CONFIG_X86_32 +bool has_eflag(unsigned long mask); +#else +static inline bool has_eflag(unsigned long mask) { return true; } +#endif void get_cpuflags(void); void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d); +bool has_cpuflag(int flag); #endif diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index c9299aeb7333e..3882ead513f74 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -22,6 +22,7 @@ # This script requires: # bash # syslinux +# genisoimage # mtools (for fdimage* and hdimage) # edk2/OVMF (for hdimage) # @@ -251,7 +252,9 @@ geniso() { cp "$isolinux" "$ldlinux" "$tmp_dir" cp "$FBZIMAGE" "$tmp_dir"/linux echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg - cp "${FDINITRDS[@]}" "$tmp_dir"/ + if [ ${#FDINITRDS[@]} -gt 0 ]; then + cp "${FDINITRDS[@]}" "$tmp_dir"/ + fi genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \ -quiet -o "$FIMAGE" -b isolinux.bin \ -c boot.cat -no-emul-boot -boot-load-size 4 \ diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index da0ccc5de5380..22d730b227e38 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -12,8 +12,6 @@ #include -#include "../include/asm/required-features.h" -#include "../include/asm/disabled-features.h" #include "../include/asm/cpufeatures.h" #include "../include/asm/vmxfeatures.h" #include "../kernel/cpu/capflags.c" @@ -23,6 +21,7 @@ int main(void) int i, j; const char *str; + printf("#include \n\n"); printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 3a2d1360abb01..e1d594a602045 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -45,6 +45,8 @@ SECTIONS setup_size = ALIGN(ABSOLUTE(.), 4096); setup_sects = ABSOLUTE(setup_size / 512); + ASSERT(setup_sects >= 5, "The setup must be at least 5 sectors in size"); + ASSERT(setup_sects <= 64, "The setup must be at most 64 sectors in size"); } . = ALIGN(16); diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore deleted file mode 100644 index ae91f4d0d78b5..0000000000000 --- a/arch/x86/boot/tools/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -build diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c deleted file mode 100644 index 10311d77c67f8..0000000000000 --- a/arch/x86/boot/tools/build.c +++ /dev/null @@ -1,247 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 1997 Martin Mares - * Copyright (C) 2007 H. Peter Anvin - */ - -/* - * This file builds a disk-image from three different files: - * - * - setup: 8086 machine code, sets up system parm - * - system: 80386 code for actual system - * - zoffset.h: header with ZO_* defines - * - * It does some checking that all files are of the correct type, and writes - * the result to the specified destination, removing headers and padding to - * the right amount. It also writes some system data to stdout. - */ - -/* - * Changes by tytso to allow root device specification - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - * Cross compiling fixes by Gertjan van Wingerde, July 1996 - * Rewritten by Martin Mares, April 1997 - * Substantially overhauled by H. Peter Anvin, April 2007 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; - -/* Minimal number of setup sectors */ -#define SETUP_SECT_MIN 5 -#define SETUP_SECT_MAX 64 - -/* This must be large enough to hold the entire setup */ -u8 buf[SETUP_SECT_MAX*512]; - -static unsigned long _edata; - -/*----------------------------------------------------------------------*/ - -static const u32 crctab32[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, - 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, - 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, - 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, - 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, - 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, - 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, - 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, - 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, - 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, - 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, - 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, - 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, - 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, - 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, - 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, - 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, - 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, - 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, - 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, - 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, - 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, - 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, - 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, - 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, - 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, - 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, - 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, - 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, - 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, - 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, - 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, - 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, - 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, - 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, - 0x2d02ef8d -}; - -static u32 partial_crc32_one(u8 c, u32 crc) -{ - return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8); -} - -static u32 partial_crc32(const u8 *s, int len, u32 crc) -{ - while (len--) - crc = partial_crc32_one(*s++, crc); - return crc; -} - -static void die(const char * str, ...) -{ - va_list args; - va_start(args, str); - vfprintf(stderr, str, args); - va_end(args); - fputc('\n', stderr); - exit(1); -} - -static void usage(void) -{ - die("Usage: build setup system zoffset.h image"); -} - -/* - * Parse zoffset.h and find the entry points. We could just #include zoffset.h - * but that would mean tools/build would have to be rebuilt every time. It's - * not as if parsing it is hard... - */ -#define PARSE_ZOFS(p, sym) do { \ - if (!strncmp(p, "#define ZO_" #sym " ", 11+sizeof(#sym))) \ - sym = strtoul(p + 11 + sizeof(#sym), NULL, 16); \ -} while (0) - -static void parse_zoffset(char *fname) -{ - FILE *file; - char *p; - int c; - - file = fopen(fname, "r"); - if (!file) - die("Unable to open `%s': %m", fname); - c = fread(buf, 1, sizeof(buf) - 1, file); - if (ferror(file)) - die("read-error on `zoffset.h'"); - fclose(file); - buf[c] = 0; - - p = (char *)buf; - - while (p && *p) { - PARSE_ZOFS(p, _edata); - - p = strchr(p, '\n'); - while (p && (*p == '\r' || *p == '\n')) - p++; - } -} - -int main(int argc, char ** argv) -{ - unsigned int i, sz, setup_sectors; - int c; - struct stat sb; - FILE *file, *dest; - int fd; - void *kernel; - u32 crc = 0xffffffffUL; - - if (argc != 5) - usage(); - parse_zoffset(argv[3]); - - dest = fopen(argv[4], "w"); - if (!dest) - die("Unable to write `%s': %m", argv[4]); - - /* Copy the setup code */ - file = fopen(argv[1], "r"); - if (!file) - die("Unable to open `%s': %m", argv[1]); - c = fread(buf, 1, sizeof(buf), file); - if (ferror(file)) - die("read-error on `setup'"); - if (c < 1024) - die("The setup must be at least 1024 bytes"); - if (get_unaligned_le16(&buf[510]) != 0xAA55) - die("Boot block hasn't got boot flag (0xAA55)"); - fclose(file); - - /* Pad unused space with zeros */ - setup_sectors = (c + 4095) / 4096; - setup_sectors *= 8; - if (setup_sectors < SETUP_SECT_MIN) - setup_sectors = SETUP_SECT_MIN; - i = setup_sectors*512; - memset(buf+c, 0, i-c); - - /* Open and stat the kernel file */ - fd = open(argv[2], O_RDONLY); - if (fd < 0) - die("Unable to open `%s': %m", argv[2]); - if (fstat(fd, &sb)) - die("Unable to stat `%s': %m", argv[2]); - if (_edata != sb.st_size) - die("Unexpected file size `%s': %u != %u", argv[2], _edata, - sb.st_size); - sz = _edata - 4; - kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); - if (kernel == MAP_FAILED) - die("Unable to mmap '%s': %m", argv[2]); - - crc = partial_crc32(buf, i, crc); - if (fwrite(buf, 1, i, dest) != i) - die("Writing setup failed"); - - /* Copy the kernel code */ - crc = partial_crc32(kernel, sz, crc); - if (fwrite(kernel, 1, sz, dest) != sz) - die("Writing kernel failed"); - - /* Write the CRC */ - put_unaligned_le32(crc, buf); - if (fwrite(buf, 1, 4, dest) != 4) - die("Writing CRC failed"); - - /* Catch any delayed write failures */ - if (fclose(dest)) - die("Writing image failed"); - - close(fd); - - /* Everything is OK */ - return 0; -} diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 82492efc5d949..96c7bc698e6b6 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -2853,19 +2853,8 @@ struct snp_msg_desc *snp_msg_alloc(void) if (!mdesc->response) goto e_free_request; - mdesc->certs_data = alloc_shared_pages(SEV_FW_BLOB_MAX_SIZE); - if (!mdesc->certs_data) - goto e_free_response; - - /* initial the input address for guest request */ - mdesc->input.req_gpa = __pa(mdesc->request); - mdesc->input.resp_gpa = __pa(mdesc->response); - mdesc->input.data_gpa = __pa(mdesc->certs_data); - return mdesc; -e_free_response: - free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); e_free_request: free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); e_unmap: @@ -2885,7 +2874,6 @@ void snp_msg_free(struct snp_msg_desc *mdesc) kfree(mdesc->ctx); free_shared_pages(mdesc->response, sizeof(struct snp_guest_msg)); free_shared_pages(mdesc->request, sizeof(struct snp_guest_msg)); - free_shared_pages(mdesc->certs_data, SEV_FW_BLOB_MAX_SIZE); iounmap((__force void __iomem *)mdesc->secrets); memset(mdesc, 0, sizeof(*mdesc)); @@ -3054,7 +3042,7 @@ static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r * sequence number must be incremented or the VMPCK must be deleted to * prevent reuse of the IV. */ - rc = snp_issue_guest_request(req, &mdesc->input, rio); + rc = snp_issue_guest_request(req, &req->input, rio); switch (rc) { case -ENOSPC: /* @@ -3064,7 +3052,7 @@ static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r * order to increment the sequence number and thus avoid * IV reuse. */ - override_npages = mdesc->input.data_npages; + override_npages = req->input.data_npages; req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; /* @@ -3120,7 +3108,7 @@ static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r } if (override_npages) - mdesc->input.data_npages = override_npages; + req->input.data_npages = override_npages; return rc; } @@ -3158,6 +3146,11 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req */ memcpy(mdesc->request, &mdesc->secret_request, sizeof(mdesc->secret_request)); + /* Initialize the input address for guest request */ + req->input.req_gpa = __pa(mdesc->request); + req->input.resp_gpa = __pa(mdesc->response); + req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0; + rc = __handle_guest_request(mdesc, req, rio); if (rc) { if (rc == -EIO && diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index 581296255b39e..d5d091e03bd34 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -1,6 +1,4 @@ # global x86 required specific stuff -# On 32-bit HIGHMEM4G is not allowed -CONFIG_HIGHMEM64G=y CONFIG_64BIT=y # These enable us to allow some of the diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index eb153eff9331a..b37881bb9f157 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -17,6 +17,7 @@ */ #include +#include #include #define STATE1 %xmm0 @@ -1071,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc) * size_t len, u8 *iv) */ SYM_FUNC_START(aesni_ctr_enc) + ANNOTATE_NOENDBR FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 646477a13e110..1dfef28c12663 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include +#include #include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) jmp .Ldec_max24; SYM_FUNC_END(__camellia_dec_blk16) -SYM_FUNC_START(camellia_ecb_enc_16way) +SYM_TYPED_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) RET; SYM_FUNC_END(camellia_ecb_enc_16way) -SYM_FUNC_START(camellia_ecb_dec_16way) +SYM_TYPED_FUNC_START(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) RET; SYM_FUNC_END(camellia_ecb_dec_16way) -SYM_FUNC_START(camellia_cbc_dec_16way) +SYM_TYPED_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index a0eb94e53b1bb..b1c9b94505557 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -6,6 +6,7 @@ */ #include +#include #include #define CAMELLIA_TABLE_BYTE_LEN 272 diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 816b6bb8bded7..824cb94de6c25 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include +#include .file "camellia-x86_64-asm_64.S" .text @@ -177,7 +178,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -SYM_FUNC_START(__camellia_enc_blk) +SYM_TYPED_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk) RET; SYM_FUNC_END(__camellia_enc_blk) -SYM_FUNC_START(camellia_dec_blk) +SYM_TYPED_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -SYM_FUNC_START(__camellia_enc_blk_2way) +SYM_TYPED_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way) RET; SYM_FUNC_END(__camellia_enc_blk_2way) -SYM_FUNC_START(camellia_dec_blk_2way) +SYM_TYPED_FUNC_START(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 97e2836218518..84e47f7f6188c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -9,6 +9,7 @@ */ #include +#include #include #include "glue_helper-asm-avx.S" @@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) RET; SYM_FUNC_END(__serpent_dec_blk8_avx) -SYM_FUNC_START(serpent_ecb_enc_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) -SYM_FUNC_START(serpent_ecb_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) -SYM_FUNC_START(serpent_cbc_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index d2288bf38a8a5..071e90e7f0d8f 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -6,6 +6,7 @@ */ #include +#include .file "twofish-x86_64-asm-3way.S" .text @@ -220,7 +221,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -SYM_FUNC_START(__twofish_enc_blk_3way) +SYM_TYPED_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) RET; SYM_FUNC_END(__twofish_enc_blk_3way) -SYM_FUNC_START(twofish_dec_blk_3way) +SYM_TYPED_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 775af290cd196..e08b4ba07b934 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -8,6 +8,7 @@ .text #include +#include #include #define a_offset 0 @@ -202,7 +203,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -SYM_FUNC_START(twofish_enc_blk) +SYM_TYPED_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk) RET SYM_FUNC_END(twofish_enc_blk) -SYM_FUNC_START(twofish_dec_blk) +SYM_TYPED_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index ea81770629eea..cb0911c5dc5dd 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -431,6 +431,7 @@ For 32-bit we have the following conventions - kernel is built with /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func SYM_FUNC_START(\name) + ANNOTATE_NOENDBR pushq %rbp movq %rsp, %rbp diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 94941c5a10ac1..3514bf2978eed 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -142,7 +142,7 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs) #ifdef CONFIG_IA32_EMULATION bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); -static int ia32_emulation_override_cmdline(char *arg) +static int __init ia32_emulation_override_cmdline(char *arg) { return kstrtobool(arg, &__ia32_enabled); } @@ -190,6 +190,7 @@ static __always_inline bool int80_is_external(void) /** * do_int80_emulation - 32-bit legacy syscall C entry from asm + * @regs: syscall arguments in struct pt_args on the stack. * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index b7ea3e8e9eccd..088f91f76edb3 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,7 @@ .pushsection .noinstr.text, "ax" SYM_FUNC_START(entry_ibpb) + ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx movl $PRED_CMD_IBPB, %eax xorl %edx, %edx @@ -52,7 +54,6 @@ EXPORT_SYMBOL_GPL(mds_verw_sel); THUNK warn_thunk_thunk, __warn_thunk -#ifndef CONFIG_X86_64 /* * Clang's implementation of TLS stack cookies requires the variable in * question to be a TLS variable. If the variable happens to be defined as an @@ -66,4 +67,3 @@ THUNK warn_thunk_thunk, __warn_thunk #ifdef CONFIG_STACKPROTECTOR EXPORT_SYMBOL(__ref_stack_chk_guard); #endif -#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 20be5758c2d2e..92c0b4a94e0ad 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1153,7 +1153,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1217,7 +1217,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f52dbe0ad93cd..f40bdf97d390a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -175,6 +175,7 @@ SYM_CODE_END(entry_SYSCALL_64) */ .pushsection .text, "ax" SYM_FUNC_START(__switch_to_asm) + ANNOTATE_NOENDBR /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -192,7 +193,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) + movq %rbx, PER_CPU_VAR(__stack_chk_guard) #endif /* @@ -742,6 +743,7 @@ _ASM_NOKPROBE(common_interrupt_return) * Is in entry.text as it shouldn't be instrumented. */ SYM_FUNC_START(asm_load_gs_index) + ANNOTATE_NOENDBR FRAME_BEGIN swapgs .Lgs_change: @@ -1166,7 +1168,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1484,7 +1486,7 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS @@ -1526,6 +1528,7 @@ SYM_CODE_END(rewind_stack_and_make_dead) * refactored in the future if needed. */ SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR push %rbp mov %rsp, %rbp movl $5, %ecx diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index ed0a5f2dc1297..a45e1125fc6cf 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,7 +57,7 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ @@ -193,7 +193,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index a02bc6f3d2e6a..29c5c32c16c36 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -58,6 +58,7 @@ SYM_CODE_END(asm_fred_entrypoint_kernel) #if IS_ENABLED(CONFIG_KVM_INTEL) SYM_FUNC_START(asm_fred_entry_from_kvm) + ANNOTATE_NOENDBR push %rbp mov %rsp, %rbp diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4d0fb2fba7e20..62283097a7cc7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -396,7 +396,7 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx -384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +384 i386 arch_prctl sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents 386 i386 rseq sys_rseq 393 i386 semget sys_semget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5eb708bff1c79..88e388c7675b7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index c9216ac4fb1eb..54d3e9774d620 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -4,7 +4,7 @@ # # Include the generic Makefile to check the built vDSO: -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Files to link into the vDSO: vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o @@ -32,7 +32,7 @@ targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 \ -z max-page-size=4096 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE @@ -133,6 +133,7 @@ KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += -DBUILD_VDSO ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) @@ -151,10 +152,9 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE quiet_cmd_vdso = VDSO $@ cmd_vdso = $(LD) -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -T $(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(src)/checkundef.sh '$(NM)' '$@' + -T $(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \ +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 --no-undefined \ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack quiet_cmd_vdso_and_check = VDSO $@ diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh deleted file mode 100755 index 7ee90a9b549da..0000000000000 --- a/arch/x86/entry/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -nm="$1" -file="$2" -$nm "$file" | grep '^ *U' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 872947c1004c3..ec1ac191a0578 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include /* * Linker script for vDSO. This is an ELF shared object prelinked to @@ -17,14 +18,9 @@ SECTIONS * segment. */ - vvar_start = . - __VVAR_PAGES * PAGE_SIZE; - vvar_page = vvar_start; + VDSO_VVAR_SYMS - vdso_rng_data = vvar_page + __VDSO_RND_DATA_OFFSET; - - timens_page = vvar_start + PAGE_SIZE; - - vclock_pages = vvar_start + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE; + vclock_pages = VDSO_VCLOCK_PAGES_START(vdso_u_data); pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE; hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 90d15f2a72055..f84e8f8fa5fe6 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -69,33 +69,12 @@ const char *outfilename; -/* Symbols that we need in vdso2c. */ -enum { - sym_vvar_start, - sym_vvar_page, - sym_pvclock_page, - sym_hvclock_page, - sym_timens_page, -}; - -const int special_pages[] = { - sym_vvar_page, - sym_pvclock_page, - sym_hvclock_page, - sym_timens_page, -}; - struct vdso_sym { const char *name; bool export; }; struct vdso_sym required_syms[] = { - [sym_vvar_start] = {"vvar_start", true}, - [sym_vvar_page] = {"vvar_page", true}, - [sym_pvclock_page] = {"pvclock_page", true}, - [sym_hvclock_page] = {"hvclock_page", true}, - [sym_timens_page] = {"timens_page", true}, {"VDSO32_NOTE_MASK", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 67b3e37576a64..78ed1c1f28b92 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,26 +150,6 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } } - /* Validate mapping addresses. */ - for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - INT_BITS symval = syms[special_pages[i]]; - - if (!symval) - continue; /* The mapping isn't used; ignore it. */ - - if (symval % 4096) - fail("%s must be a multiple of 4096\n", - required_syms[i].name); - if (symval + 4096 < syms[sym_vvar_start]) - fail("%s underruns vvar_start\n", - required_syms[i].name); - if (symval + 4096 > 0) - fail("%s is on the wrong side of the vdso text\n", - required_syms[i].name); - } - if (syms[sym_vvar_start] % 4096) - fail("vvar_begin must be a multiple of 4096\n"); - if (!image_name) { fwrite(stripped_addr, stripped_len, 1, outfile); return; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 39e6efc1a9cab..9518bf1ddf358 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -27,13 +27,7 @@ #include #include -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)vvar_page; -} - -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; +static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES); unsigned int vclocks_used __read_mostly; @@ -48,13 +42,11 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len), - NULL); + image->alt_len)); return 0; } -static const struct vm_special_mapping vvar_mapping; struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, @@ -98,99 +90,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -#ifdef CONFIG_TIME_NS -/* - * The vvar page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_mapping)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - const struct vdso_image *image = vma->vm_mm->context.vdso_image; - unsigned long pfn; - long sym_offset; - - if (!image) - return VM_FAULT_SIGBUS; - - sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + - image->sym_vvar_start; - - /* - * Sanity check: a symbol offset of zero means that the page - * does not exist for this vdso image, not that the page is at - * offset zero relative to the text mapping. This should be - * impossible here, because sym_offset should only be zero for - * the page past the end of the vvar mapping. - */ - if (sym_offset == 0) - return VM_FAULT_SIGBUS; - - if (sym_offset == image->sym_vvar_page) { - struct page *timens_page = find_timens_vvar_page(vma); - - pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; - - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the sym_vvar_page offset and - * the real VVAR page is mapped with the sym_timens_page - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (timens_page) { - unsigned long addr; - vm_fault_t err; - - /* - * Optimization: inside time namespace pre-fault - * VVAR page too. As on timens page there are only - * offsets for clocks on VVAR, it'll be faulted - * shortly by VDSO code. - */ - addr = vmf->address + (image->sym_timens_page - sym_offset); - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - - pfn = page_to_pfn(timens_page); - } - - return vmf_insert_pfn(vma, vmf->address, pfn); - - } else if (sym_offset == image->sym_timens_page) { - struct page *timens_page = find_timens_vvar_page(vma); - - if (!timens_page) - return VM_FAULT_SIGBUS; - - pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; - return vmf_insert_pfn(vma, vmf->address, pfn); - } - - return VM_FAULT_SIGBUS; -} - static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -212,7 +111,6 @@ static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, case VDSO_PAGE_HVCLOCK_OFFSET: { unsigned long pfn = hv_get_tsc_pfn(); - if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, pfn); break; @@ -228,10 +126,6 @@ static const struct vm_special_mapping vdso_mapping = { .fault = vdso_fault, .mremap = vdso_mremap, }; -static const struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; static const struct vm_special_mapping vvar_vclock_mapping = { .name = "[vvar_vclock]", .fault = vvar_vclock_fault, @@ -253,13 +147,13 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) return -EINTR; addr = get_unmapped_area(NULL, addr, - image->size - image->sym_vvar_start, 0, 0); + image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - text_start = addr - image->sym_vvar_start; + text_start = addr + __VDSO_PAGES * PAGE_SIZE; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -276,13 +170,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) goto up_fail; } - vma = _install_special_mapping(mm, - addr, - (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); - + vma = vdso_install_vvar_mapping(mm, addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); @@ -290,7 +178,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } vma = _install_special_mapping(mm, - addr + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, + VDSO_VCLOCK_PAGES_START(addr), VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| VM_PFNMAP, @@ -327,7 +215,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) */ for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || - vma_is_special_mapping(vma, &vvar_mapping) || + vma_is_special_mapping(vma, &vdso_vvar_mapping) || vma_is_special_mapping(vma, &vvar_vclock_mapping)) { mmap_write_unlock(mm); return -EEXIST; diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e7a8b8758e088..7b52b8e3a1851 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,9 +28,6 @@ static u32 ibs_caps; #include #include -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - /* attr.config2 */ #define IBS_SW_FILTER_MASK 1 @@ -89,6 +86,7 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u16 min_period; u64 max_period; unsigned long offset_mask[1]; int offset_max; @@ -270,11 +268,19 @@ static int validate_group(struct perf_event *event) return 0; } +static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + return perf_ibs == &perf_ibs_op && + (ibs_caps & IBS_CAPS_OPLDLAT) && + (event->attr.config1 & 0xFFF); +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; - u64 max_cnt, config; + u64 config; int ret; perf_ibs = get_ibs_pmu(event->attr.type); @@ -310,25 +316,47 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; + + if (event->attr.freq) { + hwc->sample_period = perf_ibs->min_period; + } else { + /* Silently mask off lower nibble. IBS hw mandates it. */ + hwc->sample_period &= ~0x0FULL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } } else { - max_cnt = config & perf_ibs->cnt_mask; + u64 period = 0; + + if (event->attr.freq) + return -EINVAL; + + if (perf_ibs == &perf_ibs_op) { + period = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + period |= config & IBS_OP_MAX_CNT_EXT_MASK; + } else { + period = (config & IBS_FETCH_MAX_CNT) << 4; + } + config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; + event->attr.sample_period = period; + hwc->sample_period = period; + + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; } - if (!hwc->sample_period) - return -EINVAL; + if (perf_ibs_ldlat_event(perf_ibs, event)) { + u64 ldlat = event->attr.config1 & 0xFFF; + + if (ldlat < 128 || ldlat > 2048) + return -EINVAL; + ldlat >>= 7; + + config |= (ldlat - 1) << 59; + config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN; + } /* * If we modify hwc->sample_period, we also need to update @@ -349,7 +377,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, int overflow; /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, perf_ibs->min_period, + perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); return overflow; @@ -447,6 +476,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + perf_ibs_set_period(perf_ibs, hwc, &period); if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { config |= period & IBS_OP_MAX_CNT_EXT_MASK; @@ -554,6 +586,28 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +static int perf_ibs_check_period(struct perf_event *event, u64 value) +{ + struct perf_ibs *perf_ibs; + u64 low_nibble; + + if (event->attr.freq) + return 0; + + perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + low_nibble = value & 0xFULL; + + /* + * This contradicts with perf_ibs_init() which allows sample period + * with lower nibble bits set but silently masks them off. Whereas + * this returns error. + */ + if (low_nibble || value < perf_ibs->min_period) + return -EINVAL; + + return 0; +} + /* * We need to initialize with empty group if all attributes in the * group are dynamic. @@ -572,7 +626,10 @@ PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); +PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -580,6 +637,18 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; } +static umode_t +ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0; +} + +static umode_t +ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0; +} + static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, &format_attr_swfilt.attr, @@ -596,6 +665,16 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_cap_attrs[] = { + &ibs_op_ldlat_cap.attr.attr, + NULL, +}; + +static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { + &ibs_op_dtlb_pgsize_cap.attr.attr, + NULL, +}; + static struct attribute_group group_fetch_formats = { .name = "format", .attrs = fetch_attrs, @@ -613,6 +692,18 @@ static struct attribute_group group_zen4_ibs_extensions = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_ibs_op_ldlat_cap = { + .name = "caps", + .attrs = ibs_op_ldlat_cap_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + +static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { + .name = "caps", + .attrs = ibs_op_dtlb_pgsize_cap_attrs, + .is_visible = ibs_op_dtlb_pgsize_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_fetch_formats, &empty_caps_group, @@ -651,6 +742,11 @@ static struct attribute_group group_op_formats = { .attrs = op_attrs, }; +static struct attribute *ibs_op_ldlat_format_attrs[] = { + &ibs_op_ldlat_format.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -669,10 +765,19 @@ static const struct attribute_group *op_attr_groups[] = { NULL, }; +static struct attribute_group group_ibs_op_ldlat_format = { + .name = "format", + .attrs = ibs_op_ldlat_format_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, &group_zen4_ibs_extensions, + &group_ibs_op_ldlat_cap, + &group_ibs_op_ldlat_format, + &group_ibs_op_dtlb_pgsize_cap, NULL, }; @@ -686,12 +791,14 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, + .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .min_period = 0x10, .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, @@ -709,13 +816,15 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, + .config_mask = IBS_OP_MAX_CNT, .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .min_period = 0x90, .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, @@ -917,6 +1026,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, if (!op_data3->dc_lin_addr_valid) return; + if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) && + !op_data3->dc_phy_addr_valid) + return; + if (!op_data3->dc_l1tlb_miss) { data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; return; @@ -1021,15 +1134,25 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } } -static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, +static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return perf_ibs == &perf_ibs_op && + sample_type & (PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_WEIGHT_TYPE | + PERF_SAMPLE_ADDR | + PERF_SAMPLE_PHYS_ADDR); +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, + struct perf_event *event, int check_rip) { - if (sample_type & PERF_SAMPLE_RAW || - (perf_ibs == &perf_ibs_op && - (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR || - sample_type & PERF_SAMPLE_PHYS_ADDR))) + if (event->attr.sample_type & PERF_SAMPLE_RAW || + perf_ibs_is_mem_sample_type(perf_ibs, event) || + perf_ibs_ldlat_event(perf_ibs, event)) return perf_ibs->offset_max; else if (check_rip) return 3; @@ -1084,7 +1207,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip); do { rdmsrl(msr + offset, *buf++); @@ -1093,6 +1216,22 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + + if (perf_ibs_ldlat_event(perf_ibs, event)) { + union ibs_op_data3 op_data3; + + op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + /* + * Opening event is errored out if load latency threshold is + * outside of [128, 2048] range. Since the event has reached + * interrupt handler, we can safely assume the threshold is + * within [128, 2048] range. + */ + if (!op_data3.ld_op || !op_data3.dc_miss || + op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF)) + goto out; + } + /* * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately * depending on their availability. @@ -1155,6 +1294,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); + + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + out: if (throttle) { perf_ibs_stop(event, 0); @@ -1244,7 +1387,8 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; - perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK | + IBS_OP_CUR_CNT_EXT_MASK); } if (ibs_caps & IBS_CAPS_ZEN4) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index b15f7b950d2e0..f8228d8243f7d 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -30,7 +30,7 @@ #define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) #define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) -#define IOMMU_NAME_SIZE 16 +#define IOMMU_NAME_SIZE 24 struct perf_amd_iommu { struct list_head list; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8f218ac0d445c..c76bab97cc68d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); +DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -628,7 +630,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (event->attr.sample_period && x86_pmu.limit_period) { + if (!event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) @@ -1298,6 +1300,15 @@ static void x86_pmu_enable(struct pmu *pmu) if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; + + /* + * The late setup (after counters are scheduled) + * is required for some cases, e.g., PEBS counters + * snapshotting. Because an accurate counter index + * is needed. + */ + static_call_cond(x86_pmu_late_setup)(); + /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -2035,6 +2046,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); + + static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); } static void _x86_pmu_read(struct perf_event *event) @@ -2844,7 +2857,7 @@ static bool is_uprobe_at_func_entry(struct pt_regs *regs) return true; /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) return true; return false; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 8f78b0c900ef4..39a987d5eb6e0 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -36,7 +36,7 @@ enum { BTS_STATE_ACTIVE, }; -static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); +static struct bts_ctx __percpu *bts_ctx; #define BTS_RECORD_SIZE 24 #define BTS_SAFETY_MARGIN 4080 @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[]; + struct bts_phys buf[] __counted_by(nr_bufs); }; static struct pmu bts_pmu; @@ -231,7 +231,7 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); static void __bts_event_start(struct perf_event *event) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct bts_buffer *buf = perf_get_aux(&bts->handle); u64 config = 0; @@ -260,7 +260,7 @@ static void __bts_event_start(struct perf_event *event) static void bts_event_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct bts_buffer *buf; buf = perf_aux_output_begin(&bts->handle, event); @@ -290,7 +290,7 @@ static void bts_event_start(struct perf_event *event, int flags) static void __bts_event_stop(struct perf_event *event, int state) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ WRITE_ONCE(bts->state, state); @@ -305,7 +305,7 @@ static void __bts_event_stop(struct perf_event *event, int state) static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct bts_buffer *buf = NULL; int state = READ_ONCE(bts->state); @@ -338,9 +338,14 @@ static void bts_event_stop(struct perf_event *event, int flags) void intel_bts_enable_local(void) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - int state = READ_ONCE(bts->state); + struct bts_ctx *bts; + int state; + + if (!bts_ctx) + return; + bts = this_cpu_ptr(bts_ctx); + state = READ_ONCE(bts->state); /* * Here we transition from INACTIVE to ACTIVE; * if we instead are STOPPED from the interrupt handler, @@ -358,7 +363,12 @@ void intel_bts_enable_local(void) void intel_bts_disable_local(void) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts; + + if (!bts_ctx) + return; + + bts = this_cpu_ptr(bts_ctx); /* * Here we transition from ACTIVE to INACTIVE; @@ -450,12 +460,17 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) int intel_bts_interrupt(void) { struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct perf_event *event = bts->handle.event; + struct bts_ctx *bts; + struct perf_event *event; struct bts_buffer *buf; s64 old_head; int err = -ENOSPC, handled = 0; + if (!bts_ctx) + return 0; + + bts = this_cpu_ptr(bts_ctx); + event = bts->handle.event; /* * The only surefire way of knowing if this NMI is ours is by checking * the write ptr against the PMI threshold. @@ -518,7 +533,7 @@ static void bts_event_del(struct perf_event *event, int mode) static int bts_event_add(struct perf_event *event, int mode) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -605,6 +620,10 @@ static __init int bts_init(void) return -ENODEV; } + bts_ctx = alloc_percpu(struct bts_ctx); + if (!bts_ctx) + return -ENOMEM; + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e86333eee2668..a5778c7d87544 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -397,34 +397,28 @@ static struct event_constraint intel_lnc_event_constraints[] = { METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + INTEL_EVENT_CONSTRAINT(0x20, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x012a, 0xf), + INTEL_UEVENT_CONSTRAINT(0x012b, 0xf), INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), - /* - * Generally event codes < 0x90 are restricted to counters 0-3. - * The 0x2E and 0x3C are exception, which has no restriction. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), - INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc), INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), - INTEL_EVENT_CONSTRAINT(0xce, 0x1), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), - /* - * Generally event codes >= 0x90 are likely to have no restrictions. - * The exception are defined as above. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + INTEL_UEVENT_CONSTRAINT(0x00e0, 0xf), EVENT_CONSTRAINT_END }; @@ -2720,7 +2714,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots, * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2728,13 +2722,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) bool reset = true; int idx; - /* read Fixed counter 3 */ - rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); - if (!slots) - return 0; + if (!val) { + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; - /* read PERF_METRICS */ - rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + } else { + slots = val[0]; + metrics = val[1]; + /* + * Don't reset the PERF_METRICS and Fixed counter 3 + * for each PEBS record read. Utilize the RDPMC metrics + * clear mode. + */ + reset = false; + } for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) @@ -2777,36 +2782,47 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) return slots; } -static u64 icl_update_topdown_event(struct perf_event *event) +static u64 icl_update_topdown_event(struct perf_event *event, u64 *val) { return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + - x86_pmu.num_topdown_events - 1); + x86_pmu.num_topdown_events - 1, + val); } -DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); -static void intel_pmu_read_topdown_event(struct perf_event *event) +static void intel_pmu_read_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) || + is_pebs_counter_event_group(event)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + bool pmu_enabled = cpuc->enabled; - /* Only need to call update_topdown_event() once for group read. */ - if ((cpuc->txn_flags & PERF_PMU_TXN_READ) && - !is_slots_event(event)) - return; + /* Only need to call update_topdown_event() once for group read. */ + if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ)) + return; - perf_pmu_disable(event->pmu); - static_call(intel_pmu_update_topdown_event)(event); - perf_pmu_enable(event->pmu); -} + cpuc->enabled = 0; + if (pmu_enabled) + intel_pmu_disable_all(); -static void intel_pmu_read_event(struct perf_event *event) -{ - if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) - intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event)) - intel_pmu_read_topdown_event(event); - else - x86_perf_event_update(event); + /* + * If the PEBS counters snapshotting is enabled, + * the topdown event is available in PEBS records. + */ + if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + static_call(intel_pmu_update_topdown_event)(event, NULL); + else + intel_pmu_drain_pebs_buffer(); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + intel_pmu_enable_all(0); + + return; + } + + x86_perf_event_update(event); } static void intel_pmu_enable_fixed(struct perf_event *event) @@ -2938,7 +2954,7 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { if (unlikely(is_topdown_count(event))) - return static_call(intel_pmu_update_topdown_event)(event); + return static_call(intel_pmu_update_topdown_event)(event, NULL); return x86_perf_event_update(event); } @@ -3076,7 +3092,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu_handle_guest_pebs(regs, &data); - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* @@ -3104,7 +3120,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - static_call(intel_pmu_update_topdown_event)(NULL); + static_call(intel_pmu_update_topdown_event)(NULL, NULL); } /* @@ -3122,6 +3138,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (!test_bit(bit, cpuc->active_mask)) continue; + /* + * There may be unprocessed PEBS records in the PEBS buffer, + * which still stores the previous values. + * Process those records first before handling the latest value. + * For example, + * A is a regular counter + * B is a PEBS event which reads A + * C is a PEBS event + * + * The following can happen: + * B-assist A=1 + * C A=2 + * B-assist A=3 + * A-overflow-PMI A=4 + * C-assist-PMI (PEBS buffer) A=5 + * + * The PEBS buffer has to be drained before handling the A-PMI + */ + if (is_pebs_counter_event_group(event)) + x86_pmu.drain_pebs(regs, &data); + if (!intel_pmu_save_and_restart(event)) continue; @@ -3958,6 +3995,85 @@ static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) return test_bit(idx, (unsigned long *)&intel_cap->capabilities); } +static u64 intel_pmu_freq_start_period(struct perf_event *event) +{ + int type = event->attr.type; + u64 config, factor; + s64 start; + + /* + * The 127 is the lowest possible recommended SAV (sample after value) + * for a 4000 freq (default freq), according to the event list JSON file. + * Also, assume the workload is idle 50% time. + */ + factor = 64 * 4000; + if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE) + goto end; + + /* + * The estimation of the start period in the freq mode is + * based on the below assumption. + * + * For a cycles or an instructions event, 1GHZ of the + * underlying platform, 1 IPC. The workload is idle 50% time. + * The start period = 1,000,000,000 * 1 / freq / 2. + * = 500,000,000 / freq + * + * Usually, the branch-related events occur less than the + * instructions event. According to the Intel event list JSON + * file, the SAV (sample after value) of a branch-related event + * is usually 1/4 of an instruction event. + * The start period of branch-related events = 125,000,000 / freq. + * + * The cache-related events occurs even less. The SAV is usually + * 1/20 of an instruction event. + * The start period of cache-related events = 25,000,000 / freq. + */ + config = event->attr.config & PERF_HW_EVENT_MASK; + if (type == PERF_TYPE_HARDWARE) { + switch (config) { + case PERF_COUNT_HW_CPU_CYCLES: + case PERF_COUNT_HW_INSTRUCTIONS: + case PERF_COUNT_HW_BUS_CYCLES: + case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: + case PERF_COUNT_HW_STALLED_CYCLES_BACKEND: + case PERF_COUNT_HW_REF_CPU_CYCLES: + factor = 500000000; + break; + case PERF_COUNT_HW_BRANCH_INSTRUCTIONS: + case PERF_COUNT_HW_BRANCH_MISSES: + factor = 125000000; + break; + case PERF_COUNT_HW_CACHE_REFERENCES: + case PERF_COUNT_HW_CACHE_MISSES: + factor = 25000000; + break; + default: + goto end; + } + } + + if (type == PERF_TYPE_HW_CACHE) + factor = 25000000; +end: + /* + * Usually, a prime or a number with less factors (close to prime) + * is chosen as an SAV, which makes it less likely that the sampling + * period synchronizes with some periodic event in the workload. + * Minus 1 to make it at least avoiding values near power of twos + * for the default freq. + */ + start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1; + + if (start > x86_pmu.max_period) + start = x86_pmu.max_period; + + if (x86_pmu.limit_period) + x86_pmu.limit_period(event, &start); + + return start; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3969,6 +4085,12 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.freq && event->attr.sample_freq) { + event->hw.sample_period = intel_pmu_freq_start_period(event); + event->hw.last_period = event->hw.sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + } + if (event->attr.precise_ip) { if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; @@ -4069,6 +4191,13 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } + if ((event->attr.sample_type & PERF_SAMPLE_READ) && + (x86_pmu.intel_cap.pebs_format >= 6) && + x86_pmu.intel_cap.pebs_baseline && + is_sampling_event(event) && + event->attr.precise_ip) + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -4606,9 +4735,9 @@ static int adl_hw_config(struct perf_event *event) return -EOPNOTSUPP; } -static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) +static enum intel_cpu_type adl_get_hybrid_cpu_type(void) { - return HYBRID_INTEL_CORE; + return INTEL_CPU_TYPE_CORE; } static inline bool erratum_hsw11(struct perf_event *event) @@ -4953,7 +5082,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) { - u8 cpu_type = get_this_hybrid_cpu_type(); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + enum intel_cpu_type cpu_type = c->topo.intel_type; int i; /* @@ -4962,7 +5092,7 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) * on it. There should be a fixup function provided for these * troublesome CPUs (->get_hybrid_cpu_type). */ - if (cpu_type == HYBRID_INTEL_NONE) { + if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) { if (x86_pmu.get_hybrid_cpu_type) cpu_type = x86_pmu.get_hybrid_cpu_type(); else @@ -4979,16 +5109,16 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; u32 native_id; - if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big) + if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big) return &x86_pmu.hybrid_pmu[i]; - if (cpu_type == HYBRID_INTEL_ATOM) { + if (cpu_type == INTEL_CPU_TYPE_ATOM) { if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - native_id = get_this_hybrid_cpu_native_id(); - if (native_id == skt_native_id && pmu_type == hybrid_small) + native_id = c->topo.intel_native_model_id; + if (native_id == INTEL_ATOM_SKT_NATIVE_ID && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - if (native_id == cmt_native_id && pmu_type == hybrid_tiny) + if (native_id == INTEL_ATOM_CMT_NATIVE_ID && pmu_type == hybrid_tiny) return &x86_pmu.hybrid_pmu[i]; } } @@ -6617,7 +6747,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_AIRMONT_MID: + case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c2e2eae7309c3..1f7e1a692a7a4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -953,11 +953,11 @@ int intel_pmu_drain_bts_buffer(void) return 1; } -static inline void intel_pmu_drain_pebs_buffer(void) +void intel_pmu_drain_pebs_buffer(void) { struct perf_sample_data data; - x86_pmu.drain_pebs(NULL, &data); + static_call(x86_pmu_drain_pebs)(NULL, &data); } /* @@ -1199,7 +1199,7 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), - INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3fc), INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ @@ -1294,6 +1294,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) ds->pebs_interrupt_threshold = threshold; } +#define PEBS_DATACFG_CNTRS(x) \ + ((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK) + +#define PEBS_DATACFG_CNTR_BIT(x) \ + (((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT) + +#define PEBS_DATACFG_FIX(x) \ + ((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK) + +#define PEBS_DATACFG_FIX_BIT(x) \ + (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \ + << PEBS_DATACFG_FIX_SHIFT) + static void adaptive_pebs_record_size_update(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1308,10 +1321,58 @@ static void adaptive_pebs_record_size_update(void) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); + if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) { + sz += sizeof(struct pebs_cntr_header); + + /* Metrics base and Metrics Data */ + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + sz += 2 * sizeof(u64); + + if (pebs_data_cfg & PEBS_DATACFG_CNTR) { + sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) + + hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) * + sizeof(u64); + } + } cpuc->pebs_record_size = sz; } +static void __intel_pmu_pebs_update_cfg(struct perf_event *event, + int idx, u64 *pebs_data_cfg) +{ + if (is_metric_event(event)) { + *pebs_data_cfg |= PEBS_DATACFG_METRICS; + return; + } + + *pebs_data_cfg |= PEBS_DATACFG_CNTR; + + if (idx >= INTEL_PMC_IDX_FIXED) + *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED); + else + *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx); +} + + +static void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + u64 pebs_data_cfg = 0; + int i; + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + if (!is_pebs_counter_event_group(event)) + continue; + __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg); + } + + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) + cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW; +} + #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_WEIGHT_TYPE | \ @@ -1914,12 +1975,89 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc) +{ + int shift = 64 - x86_pmu.cntval_bits; + struct hw_perf_event *hwc; + u64 delta, prev_pmc; + + /* + * A recorded counter may not have an assigned event in the + * following cases. The value should be dropped. + * - An event is deleted. There is still an active PEBS event. + * The PEBS record doesn't shrink on pmu::del(). + * If the counter of the deleted event once occurred in a PEBS + * record, PEBS still records the counter until the counter is + * reassigned. + * - An event is stopped for some reason, e.g., throttled. + * During this period, another event is added and takes the + * counter of the stopped event. The stopped event is assigned + * to another new and uninitialized counter, since the + * x86_pmu_start(RELOAD) is not invoked for a stopped event. + * The PEBS__DATA_CFG is updated regardless of the event state. + * The uninitialized counter can be recorded in a PEBS record. + * But the cpuc->events[uninitialized_counter] is always NULL, + * because the event is stopped. The uninitialized value is + * safely dropped. + */ + if (!event) + return; + + hwc = &event->hw; + prev_pmc = local64_read(&hwc->prev_count); + + /* Only update the count when the PMU is disabled */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + local64_set(&hwc->prev_count, pmc); + + delta = (pmc << shift) - (prev_pmc << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct pebs_cntr_header *cntr, + void *next_record) +{ + int bit; + + for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) { + intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record); + next_record += sizeof(u64); + } + + for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) { + /* The slots event will be handled with perf_metric later */ + if ((cntr->metrics == INTEL_CNTR_METRICS) && + (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) { + next_record += sizeof(u64); + continue; + } + intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED], + *(u64 *)next_record); + next_record += sizeof(u64); + } + + /* HW will reload the value right after the overflow. */ + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period); + + if (cntr->metrics == INTEL_CNTR_METRICS) { + static_call(intel_pmu_update_topdown_event) + (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS], + (u64 *)next_record); + next_record += 2 * sizeof(u64); + } +} + #define PEBS_LATENCY_MASK 0xffff /* * With adaptive PEBS the layout depends on what fields are configured. */ - static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -2049,6 +2187,28 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } + if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) { + struct pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct pebs_cntr_header); + /* + * The PEBS_DATA_CFG is a global register, which is the + * superset configuration for all PEBS events. + * For the PEBS record of non-sample-read group, ignore + * the counter snapshot fields. + */ + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + WARN_ONCE(next_record != __pebs + basic->format_size, "PEBS record size %u, expected %llu, config %llx\n", basic->format_size, @@ -2094,15 +2254,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) return NULL; } -void intel_pmu_auto_reload_read(struct perf_event *event) -{ - WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)); - - perf_pmu_disable(event->pmu); - intel_pmu_drain_pebs_buffer(); - perf_pmu_enable(event->pmu); -} - /* * Special variant of intel_pmu_save_and_restart() for auto-reload. */ @@ -2211,13 +2362,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event, } if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); + if ((is_pebs_counter_event_group(event))) { + /* + * The value of each sample has been updated when setup + * the corresponding sample data. + */ + perf_event_update_userpage(event); + } else { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } } else intel_pmu_save_and_restart(event); } @@ -2552,6 +2711,11 @@ void __init intel_ds_init(void) break; case 6: + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + x86_pmu.late_setup = intel_pmu_late_setup; + } + fallthrough; case 5: x86_pmu.pebs_ept = 1; fallthrough; @@ -2576,7 +2740,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); } - pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual); /* * The PEBS-via-PT is not supported on hybrid platforms, diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 60b3078b7502f..a34e50fc4a8fc 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -347,8 +347,7 @@ void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) { - hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - box->hrtimer.function = uncore_pmu_hrtimer; + hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 31c2771545a6c..353000d4884f7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; } +static inline bool is_pebs_counter_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -669,18 +674,6 @@ enum { #define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) -/* - * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture - * of the core. Bits 31-24 indicates its core type (Core or Atom) - * and Bits [23:0] indicates the native model ID of the core. - * Core type and native model ID are defined in below enumerations. - */ -enum hybrid_cpu_type { - HYBRID_INTEL_NONE, - HYBRID_INTEL_ATOM = 0x20, - HYBRID_INTEL_CORE = 0x40, -}; - #define X86_HYBRID_PMU_ATOM_IDX 0 #define X86_HYBRID_PMU_CORE_IDX 1 #define X86_HYBRID_PMU_TINY_IDX 2 @@ -697,11 +690,6 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; -enum atom_native_id { - cmt_native_id = 0x2, /* Crestmont */ - skt_native_id = 0x3, /* Skymont */ -}; - struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -800,6 +788,7 @@ struct x86_pmu { u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + void (*late_setup)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -994,7 +983,7 @@ struct x86_pmu { */ int num_hybrid_pmus; struct x86_hybrid_pmu *hybrid_pmu; - enum hybrid_cpu_type (*get_hybrid_cpu_type) (void); + enum intel_cpu_type (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { @@ -1107,6 +1096,8 @@ extern struct x86_pmu x86_pmu __read_mostly; DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); +DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1148,6 +1139,12 @@ extern u64 __read_mostly hw_cache_extra_regs u64 x86_perf_event_update(struct perf_event *event); +static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val) +{ + return x86_perf_event_update(event); +} +DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); + static inline unsigned int x86_pmu_config_addr(int index) { return x86_pmu.eventsel + (x86_pmu.addr_offset ? @@ -1643,7 +1640,7 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); -void intel_pmu_auto_reload_read(struct perf_event *event); +void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 6c977c19f2cd7..1d9e385649b57 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ - /* 0x00080 */ +PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 4952faf03e82d..63cada6174922 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -274,8 +274,7 @@ static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) { struct hrtimer *hr = &rapl_pmu->hrtimer; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = rapl_hrtimer_handle; + hrtimer_setup(hr, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, @@ -879,6 +878,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl), {}, }; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index cc8c3bd0e7c29..1f7c3082a36db 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 58f4ddecc5fa2..4566000e15c44 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,6 +8,7 @@ generated-y += syscalls_x32.h generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h +generated-y += cpufeaturemasks.h generic-y += early_ioremap.h generic-y += fprobe.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e3903b731305c..484dfea35aaa9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,7 +48,7 @@ ".popsection\n" \ "671:" -#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock " #else /* ! CONFIG_SMP */ #define LOCK_PREFIX_HERE "" @@ -87,20 +87,19 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; -struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod); -extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); -extern void apply_returns(s32 *start, s32 *end, struct module *mod); -extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); +extern void apply_returns(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi, struct module *mod); + s32 *start_cfi, s32 *end_cfi); + +struct module; struct callthunk_sites { s32 *call_start, *call_end; - struct alt_instr *alt_start, *alt_end; }; #ifdef CONFIG_CALL_THUNKS @@ -237,10 +236,12 @@ static inline int alternatives_text_reserved(void *start, void *end) * references: i.e., if used for a function, it would add the PLT * suffix. */ -#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \ +#define alternative_call(oldfunc, newfunc, ft_flags, output, input, clobbers...) \ asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \ : ALT_OUTPUT_SP(output) \ - : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + : [old] "i" (oldfunc), [new] "i" (newfunc) \ + COMMA(input) \ + : clobbers) /* * Like alternative_call, but there are two features and respective functions. @@ -249,24 +250,14 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, old function is used. */ #define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ - output, input...) \ + output, input, clobbers...) \ asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \ "call %c[new2]", ft_flags2) \ : ALT_OUTPUT_SP(output) \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) - -/* - * use this macro(s) if you need more than one output parameter - * in alternative_io - */ -#define ASM_OUTPUT2(a...) a - -/* - * use this macro if you need clobbers but no inputs in - * alternative_{input,io,call}() - */ -#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr + [new2] "i" (newfunc2) \ + COMMA(input) \ + : clobbers) #define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__ diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index cb2a5e113daa7..77f3a589a99ac 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 4c4efb93045ed..adfa0854cf2da 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -27,7 +27,6 @@ struct amd_l3_cache { }; struct amd_northbridge { - struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h index 113ad3e8ee40a..23fe617898a8f 100644 --- a/arch/x86/include/asm/amd_node.h +++ b/arch/x86/include/asm/amd_node.h @@ -30,7 +30,31 @@ static inline u16 amd_num_nodes(void) return topology_amd_nodes_per_pkg() * topology_max_packages(); } +#ifdef CONFIG_AMD_NODE int __must_check amd_smn_read(u16 node, u32 address, u32 *value); int __must_check amd_smn_write(u16 node, u32 address, u32 value); +/* Should only be used by the HSMP driver. */ +int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write); +#else +static inline int __must_check amd_smn_read(u16 node, u32 address, u32 *value) { return -ENODEV; } +static inline int __must_check amd_smn_write(u16 node, u32 address, u32 value) { return -ENODEV; } + +static inline int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write) +{ + return -ENODEV; +} +#endif /* CONFIG_AMD_NODE */ + +/* helper for use with read_poll_timeout */ +static inline int smn_read_register(u32 reg) +{ + int data, rc; + + rc = amd_smn_read(0, reg, &data); + if (rc) + return rc; + + return data; +} #endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f21ff19326994..c903d358405d3 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -99,8 +99,8 @@ static inline void native_apic_mem_write(u32 reg, u32 v) volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, - ASM_OUTPUT2("=r" (v), "=m" (*addr)), - ASM_OUTPUT2("0" (v), "m" (*addr))); + ASM_OUTPUT("=r" (v), "=m" (*addr)), + ASM_INPUT("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 3674006e39744..8d9e62725202e 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -16,7 +16,7 @@ #include #include -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 extern void cmpxchg8b_emu(void); #endif diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 2bec0c89a95c2..975ae7a9397eb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -213,6 +213,17 @@ static __always_inline __pure void *rip_rel_ptr(void *p) /* For C file, we already have NOKPROBE_SYMBOL macro */ +/* Insert a comma if args are non-empty */ +#define COMMA(x...) __COMMA(x) +#define __COMMA(...) , ##__VA_ARGS__ + +/* + * Combine multiple asm inline constraint args into a single arg for passing to + * another macro. + */ +#define ASM_OUTPUT(x...) x +#define ASM_INPUT(x...) x + /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6c6e9b9f98a45..ab838205c1c66 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -48,17 +48,20 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) ATOMIC64_EXPORT(atomic64_##sym) #endif -#ifdef CONFIG_X86_CMPXCHG64 -#define __alternative_atomic64(f, g, out, in...) \ - asm volatile("call %c[func]" \ +#ifdef CONFIG_X86_CX8 +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + asm volatile("call %c[func]" \ : ALT_OUTPUT_SP(out) \ - : [func] "i" (atomic64_##g##_cx8), ## in) + : [func] "i" (atomic64_##g##_cx8) \ + COMMA(in) \ + : clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else -#define __alternative_atomic64(f, g, out, in...) \ - alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ - X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in) +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ + X86_FEATURE_CX8, ASM_OUTPUT(out), \ + ASM_INPUT(in), clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \ ATOMIC64_DECL_ONE(sym##_386) @@ -69,8 +72,8 @@ ATOMIC64_DECL_ONE(inc_386); ATOMIC64_DECL_ONE(dec_386); #endif -#define alternative_atomic64(f, out, in...) \ - __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in) +#define alternative_atomic64(f, out, in, clobbers...) \ + __alternative_atomic64(f, f, ASM_OUTPUT(out), ASM_INPUT(in), clobbers) ATOMIC64_DECL(read); ATOMIC64_DECL(set); @@ -105,9 +108,10 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; - alternative_atomic64(xchg, "=&A" (o), - "S" (v), "b" (low), "c" (high) - : "memory"); + alternative_atomic64(xchg, + "=&A" (o), + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "memory"); return o; } #define arch_atomic64_xchg arch_atomic64_xchg @@ -116,23 +120,25 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; - alternative_atomic64(set, /* no output */, - "S" (v), "b" (low), "c" (high) - : "eax", "edx", "memory"); + alternative_atomic64(set, + /* no output */, + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "eax", "edx", "memory"); } static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; - alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); + alternative_atomic64(read, "=&A" (r), "c" (v), "memory"); return r; } static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_add_return arch_atomic64_add_return @@ -140,8 +146,9 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_sub_return arch_atomic64_sub_return @@ -149,8 +156,10 @@ static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 a; - alternative_atomic64(inc_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(inc_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return @@ -158,8 +167,10 @@ static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 a; - alternative_atomic64(dec_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(dec_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_dec_return arch_atomic64_dec_return @@ -167,28 +178,34 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } static __always_inline void arch_atomic64_inc(atomic64_t *v) { - __alternative_atomic64(inc, inc_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(inc, inc_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_dec(atomic64_t *v) { - __alternative_atomic64(dec, dec_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(dec, dec_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_dec arch_atomic64_dec @@ -197,8 +214,9 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); alternative_atomic64(add_unless, - ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)), - "S" (v) : "memory"); + ASM_OUTPUT("+A" (a), "+c" (low), "+D" (high)), + "S" (v), + "memory"); return (int)a; } #define arch_atomic64_add_unless arch_atomic64_add_unless @@ -206,8 +224,10 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; - alternative_atomic64(inc_not_zero, "=&a" (r), - "S" (v) : "ecx", "edx", "memory"); + alternative_atomic64(inc_not_zero, + "=&a" (r), + "S" (v), + "ecx", "edx", "memory"); return r; } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero @@ -215,8 +235,10 @@ static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 r; - alternative_atomic64(dec_if_positive, "=&A" (r), - "S" (v) : "ecx", "memory"); + alternative_atomic64(dec_if_positive, + "=&A" (r), + "S" (v), + "ecx", "memory"); return r; } #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 7b44b3c4cce11..db70832232d4a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -12,11 +12,11 @@ */ #ifdef CONFIG_X86_32 -#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ +#define mb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ +#define rmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ +#define wmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define __mb() asm volatile("mfence":::"memory") @@ -50,7 +50,7 @@ #define __dma_rmb() barrier() #define __dma_wmb() barrier() -#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") +#define __smp_mb() asm volatile("lock addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index e85ac0c7c039e..f0e9acf725471 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -17,13 +17,17 @@ * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. */ #define INSN_ASOP 0x67 +#define INSN_LOCK 0xf0 #define OPCODE_ESCAPE 0x0f #define SECOND_BYTE_OPCODE_UD1 0xb9 #define SECOND_BYTE_OPCODE_UD2 0x0b #define BUG_NONE 0xffff -#define BUG_UD1 0xfffe -#define BUG_UD2 0xfffd +#define BUG_UD2 0xfffe +#define BUG_UD1 0xfffd +#define BUG_UD1_UBSAN 0xfffc +#define BUG_EA 0xffea +#define BUG_LOCK 0xfff0 #ifdef CONFIG_GENERIC_BUG diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 31d19c815f992..3e51ba4591546 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -101,6 +101,16 @@ enum cfi_mode { extern enum cfi_mode cfi_mode; +#ifdef CONFIG_FINEIBT_BHI +extern bool cfi_bhi; +#else +#define cfi_bhi (0) +#endif + +typedef u8 bhi_thunk[32]; +extern bhi_thunk __bhi_args[]; +extern bhi_thunk __bhi_args_end[]; + struct pt_regs; #ifdef CONFIG_CFI_CLANG @@ -125,6 +135,18 @@ static inline int cfi_get_offset(void) #define cfi_get_offset cfi_get_offset extern u32 cfi_get_func_hash(void *func); +extern int cfi_get_func_arity(void *func); + +#ifdef CONFIG_FINEIBT +extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type); +#else +static inline bool +decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + return false; +} + +#endif #else static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) @@ -137,6 +159,10 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } +static inline int cfi_get_func_arity(void *func) +{ + return 0; +} #endif /* CONFIG_CFI_CLANG */ #if HAS_KERNEL_IBT == 1 diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 5612648b02022..fd8afc1f5f6bb 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -134,7 +134,7 @@ extern void __add_wrong_size(void) __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) #define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + __raw_cmpxchg((ptr), (old), (new), (size), "lock ") #define __cmpxchg_local(ptr, old, new, size) \ __raw_cmpxchg((ptr), (old), (new), (size), "") @@ -222,7 +222,7 @@ extern void __add_wrong_size(void) __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) #define __sync_try_cmpxchg(ptr, pold, new, size) \ - __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock ") #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fd1282a783ddb..6d7afee2fa073 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -69,7 +69,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 #define arch_cmpxchg64 __cmpxchg64 @@ -91,19 +91,21 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ o.full; \ }) static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock "); } #define arch_cmpxchg64 arch_cmpxchg64 @@ -119,14 +121,16 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - CC_SET(e) \ - : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ - "+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ if (unlikely(!ret)) \ *(_oldp) = o.full; \ @@ -136,7 +140,7 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock "); } #define arch_try_cmpxchg64 arch_try_cmpxchg64 diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index aa6c8f8ca9588..e7225452963f0 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -15,6 +15,11 @@ enum cc_vendor { extern enum cc_vendor cc_vendor; extern u64 cc_mask; +static inline u64 cc_get_mask(void) +{ + return cc_mask; +} + static inline void cc_set_mask(u64 mask) { RIP_REL_REF(cc_mask) = mask; @@ -25,7 +30,10 @@ u64 cc_mkdec(u64 val); void cc_random_init(void); #else #define cc_vendor (CC_VENDOR_NONE) -static const u64 cc_mask = 0; +static inline u64 cc_get_mask(void) +{ + return 0; +} static inline u64 cc_mkenc(u64 val) { diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 98eced5084ca7..ad235dda1ded0 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -12,7 +12,6 @@ #ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 -#define safe_smp_processor_id() 0 #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU @@ -50,20 +49,6 @@ static inline void split_lock_init(void) {} static inline void bus_lock_init(void) {} #endif -#ifdef CONFIG_CPU_SUP_INTEL -u8 get_this_hybrid_cpu_type(void); -u32 get_this_hybrid_cpu_native_id(void); -#else -static inline u8 get_this_hybrid_cpu_type(void) -{ - return 0; -} - -static inline u32 get_this_hybrid_cpu_native_id(void) -{ - return 0; -} -#endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index de1ad09fe8d72..910601c59d319 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -9,6 +9,7 @@ #include #include #include +#include enum cpuid_leafs { @@ -37,92 +38,19 @@ enum cpuid_leafs NR_CPUID_WORDS, }; -#define X86_CAP_FMT_NUM "%d:%d" -#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) - extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; -#define X86_CAP_FMT "%s" -#define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_ - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; +#define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) -/* - * There are 32 bits/features in each mask word. The high bits - * (selected with (bit>>5) give us the word number and the low 5 - * bits give us the bit/feature number inside the word. - * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can - * see if it is set in the mask word. - */ -#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ - (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) - -/* - * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the - * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all - * header macros which use NCAPINTS need to be changed. The duplicated macro - * use causes the compiler to issue errors for all headers so that all usage - * sites can be corrected. - */ -#define REQUIRED_MASK_BIT_SET(feature_bit) \ - ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ - REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 22)) - -#define DISABLED_MASK_BIT_SET(feature_bit) \ - ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ - DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 22)) - #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 508c0dad116bc..8b7cf13e0acb2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -2,14 +2,6 @@ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - /* * Defines x86 CPU feature bits */ @@ -210,7 +202,6 @@ #define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ @@ -338,6 +329,7 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */ #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ @@ -468,6 +460,10 @@ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ #define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */ +#define X86_FEATURE_SRSO_BP_SPEC_REDUCE (20*32+31) /* + * BP_CFG[BpSpecReduce] can be used to mitigate SRSO for VMs. + * (SRSO_MSR_FIX in the official doc). + */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -534,4 +530,5 @@ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ +#define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index b2b9b4ef3dae1..a92e4b08820ad 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -6,6 +6,7 @@ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H +#include #include #include diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index bf5953883ec36..dea7d8b854f0e 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -12,41 +12,17 @@ struct task_struct; -struct pcpu_hot { - union { - struct { - struct task_struct *current_task; - int preempt_count; - int cpu_number; -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING - u64 call_depth; -#endif - unsigned long top_of_stack; - void *hardirq_stack_ptr; - u16 softirq_pending; -#ifdef CONFIG_X86_64 - bool hardirq_stack_inuse; -#else - void *softirq_stack_ptr; -#endif - }; - u8 pad[64]; - }; -}; -static_assert(sizeof(struct pcpu_hot) == 64); - -DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); - -/* const-qualified alias to pcpu_hot, aliased by linker. */ -DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, - const_pcpu_hot); +DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task); +/* const-qualified alias provided by the linker. */ +DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override, + const_current_task); static __always_inline struct task_struct *get_current(void) { if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) - return this_cpu_read_const(const_pcpu_hot.current_task); + return this_cpu_read_const(const_current_task); - return this_cpu_read_stable(pcpu_hot.current_task); + return this_cpu_read_stable(current_task); } #define current get_current() diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 62dc9f59ea768..ec95fe44fa3a0 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -46,7 +46,6 @@ struct gdt_page { } __attribute__((aligned(PAGE_SIZE))); DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -DECLARE_INIT_PER_CPU(gdt_page); /* Provide the original GDT */ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h deleted file mode 100644 index c492bdc97b059..0000000000000 --- a/arch/x86/include/asm/disabled-features.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_DISABLED_FEATURES_H -#define _ASM_X86_DISABLED_FEATURES_H - -/* These features, although they might be available in a CPU - * will not be used because the compile options to support - * them are not present. - * - * This code allows them to be checked and disabled at - * compile time without an explicit #ifdef. Use - * cpu_feature_enabled(). - */ - -#ifdef CONFIG_X86_UMIP -# define DISABLE_UMIP 0 -#else -# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) -#endif - -#ifdef CONFIG_X86_64 -# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) -# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) -# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) -# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) -# define DISABLE_PCID 0 -#else -# define DISABLE_VME 0 -# define DISABLE_K6_MTRR 0 -# define DISABLE_CYRIX_ARR 0 -# define DISABLE_CENTAUR_MCR 0 -# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU 0 -# define DISABLE_OSPKE 0 -#else -# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - -#ifdef CONFIG_X86_5LEVEL -# define DISABLE_LA57 0 -#else -# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#endif - -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION -# define DISABLE_PTI 0 -#else -# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -#endif - -#ifdef CONFIG_MITIGATION_RETPOLINE -# define DISABLE_RETPOLINE 0 -#else -# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) -#endif - -#ifdef CONFIG_MITIGATION_RETHUNK -# define DISABLE_RETHUNK 0 -#else -# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) -#endif - -#ifdef CONFIG_MITIGATION_UNRET_ENTRY -# define DISABLE_UNRET 0 -#else -# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -#endif - -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING -# define DISABLE_CALL_DEPTH_TRACKING 0 -#else -# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) -#endif - -#ifdef CONFIG_ADDRESS_MASKING -# define DISABLE_LAM 0 -#else -# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) -#endif - -#ifdef CONFIG_INTEL_IOMMU_SVM -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif - -#ifdef CONFIG_X86_SGX -# define DISABLE_SGX 0 -#else -# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) -#endif - -#ifdef CONFIG_XEN_PV -# define DISABLE_XENPV 0 -#else -# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) -#endif - -#ifdef CONFIG_INTEL_TDX_GUEST -# define DISABLE_TDX_GUEST 0 -#else -# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) -#endif - -#ifdef CONFIG_X86_USER_SHADOW_STACK -#define DISABLE_USER_SHSTK 0 -#else -#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) -#endif - -#ifdef CONFIG_X86_KERNEL_IBT -#define DISABLE_IBT 0 -#else -#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) -#endif - -#ifdef CONFIG_X86_FRED -# define DISABLE_FRED 0 -#else -# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) -#endif - -#ifdef CONFIG_KVM_AMD_SEV -#define DISABLE_SEV_SNP 0 -#else -#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) -#endif - -/* - * Make sure to add features to the correct mask - */ -#define DISABLED_MASK0 (DISABLE_VME) -#define DISABLED_MASK1 0 -#define DISABLED_MASK2 0 -#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 (DISABLE_PCID) -#define DISABLED_MASK5 0 -#define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) -#define DISABLED_MASK9 (DISABLE_SGX) -#define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) -#define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 -#define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ - DISABLE_ENQCMD) -#define DISABLED_MASK17 0 -#define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 (DISABLE_SEV_SNP) -#define DISABLED_MASK20 0 -#define DISABLED_MASK21 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 2e74a7f0e9357..c83645d5b2a86 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -29,7 +29,6 @@ extern unsigned long e820__end_of_low_ram_pfn(void); extern u64 e820__memblock_alloc_reserved(u64 size, u64 align); extern void e820__memblock_setup(void); -extern void e820__reserve_setup_data(void); extern void e820__finish_early_params(void); extern void e820__reserve_resources(void); extern void e820__reserve_resources_late(void); diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index 314f75d886d08..80c4a7266629c 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -35,15 +35,6 @@ enum e820_type { * marking it with the IORES_DESC_SOFT_RESERVED designation. */ E820_TYPE_SOFT_RESERVED = 0xefffffff, - - /* - * Reserved RAM used by the kernel itself if - * CONFIG_INTEL_TXT=y is enabled, memory of this type - * will be included in the S3 integrity calculation - * and so should not include any memory that the BIOS - * might alter over the S3 transition: - */ - E820_TYPE_RESERVED_KERN = 128, }; /* diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index 426fc53ff8030..dfbd1ebb9f10b 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -13,7 +13,7 @@ static inline void edac_atomic_scrub(void *va, u32 size) * are interrupt, DMA and SMP safe. */ for (i = 0; i < size / 4; i++, virt_addr++) - asm volatile("lock; addl $0, %0"::"m" (*virt_addr)); + asm volatile("lock addl $0, %0"::"m" (*virt_addr)); } #endif /* _ASM_X86_EDAC_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1fb83d47711f9..128602612ecaa 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -54,8 +54,9 @@ typedef struct user_i387_struct elf_fpregset_t; #define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ #define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ #define R_X86_64_RELATIVE 8 /* Adjust by program base */ -#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative - offset to GOT */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */ +#define R_X86_64_GOTPCRELX 41 +#define R_X86_64_REX_GOTPCRELX 42 #define R_X86_64_32 10 /* Direct 32 bit zero extended */ #define R_X86_64_32S 11 /* Direct 32 bit sign extended */ #define R_X86_64_16 12 /* Direct 16 bit zero extended */ diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f86ad3335529d..f42de5f05e7eb 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -16,10 +16,9 @@ /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It - * disables preemption so be careful if you intend to use it for long periods - * of time. - * If you intend to use the FPU in irq/softirq you need to check first with - * irq_fpu_usable() if it is possible. + * disables preemption and softirq processing, so be careful if you intend to + * use it for long periods of time. Kernel-mode FPU cannot be used in all + * contexts -- see irq_fpu_usable() for details. */ /* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ @@ -50,10 +49,10 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. - * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in - * a random state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while + * using the FPU in kernel mode. A context switch will (and softirq might) save + * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving + * CPU's FPU registers in a random state. * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. @@ -63,8 +62,6 @@ static inline void kernel_fpu_begin(void) * preemptible. Disabling preemption is the right choice here as bottom * half processing is always in thread context on RT kernels so it * implicitly prevents bottom half processing as well. - * - * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f9cb4d07df58f..f2265246249ad 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -36,21 +36,9 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) { -#ifdef CONFIG_X86_KERNEL_IBT - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) + if (is_endbr((void*)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; -#endif + return fentry_ip; } #define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 6ffa8b75f4cd3..f00c09ffe6a95 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,7 +3,6 @@ #define _ASM_X86_HARDIRQ_H #include -#include typedef struct { #if IS_ENABLED(CONFIG_KVM_INTEL) @@ -66,7 +65,8 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat -#define local_softirq_pending_ref pcpu_hot.softirq_pending +DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); +#define local_softirq_pending_ref __softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) /* diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 1e59581d500ca..9423a2967f506 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -41,7 +41,7 @@ _ASM_PTR fname "\n\t" \ ".popsection\n\t" -static inline __attribute_const__ u32 gen_endbr(void) +static __always_inline __attribute_const__ u32 gen_endbr(void) { u32 endbr; @@ -56,7 +56,7 @@ static inline __attribute_const__ u32 gen_endbr(void) return endbr; } -static inline __attribute_const__ u32 gen_endbr_poison(void) +static __always_inline __attribute_const__ u32 gen_endbr_poison(void) { /* * 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it @@ -65,15 +65,20 @@ static inline __attribute_const__ u32 gen_endbr_poison(void) return 0x001f0f66; /* osp nopl (%rax) */ } -static inline bool is_endbr(u32 val) +static inline bool __is_endbr(u32 val) { if (val == gen_endbr_poison()) return true; + /* See cfi_fineibt_bhi_preamble() */ + if (IS_ENABLED(CONFIG_FINEIBT_BHI) && val == 0x001f0ff5) + return true; + val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */ return val == gen_endbr(); } +extern __noendbr bool is_endbr(u32 *val); extern __noendbr u64 ibt_save(bool disable); extern __noendbr void ibt_restore(u64 save); @@ -98,7 +103,7 @@ extern __noendbr void ibt_restore(u64 save); #define __noendbr -static inline bool is_endbr(u32 val) { return false; } +static inline bool is_endbr(u32 *val) { return false; } static inline u64 ibt_save(bool disable) { return 0; } static inline void ibt_restore(u64 save) { } diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 6d7b04ffc5fd0..b657d78071c68 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -110,9 +110,9 @@ #define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */ -#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ @@ -126,16 +126,16 @@ #define INTEL_RAPTORLAKE_P IFM(6, 0xBA) #define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_METEORLAKE IFM(6, 0xAC) +#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */ #define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_ARROWLAKE_H IFM(6, 0xC5) +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */ #define INTEL_ARROWLAKE IFM(6, 0xC6) #define INTEL_ARROWLAKE_U IFM(6, 0xB5) -#define INTEL_LUNARLAKE_M IFM(6, 0xBD) +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */ -#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */ /* "Small Core" Processors (Atom/E-Core) */ @@ -149,9 +149,9 @@ #define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ #define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ #define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ +#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */ #define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ -#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ #define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ #define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ @@ -182,10 +182,23 @@ /* Family 19 */ #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ -/* CPU core types */ +/* + * Intel CPU core types + * + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture + * of the core. Bits 31-24 indicates its core type (Core or Atom) + * and Bits [23:0] indicates the native model ID of the core. + * Core type and native model ID are defined in below enumerations. + */ enum intel_cpu_type { + INTEL_CPU_TYPE_UNKNOWN, INTEL_CPU_TYPE_ATOM = 0x20, INTEL_CPU_TYPE_CORE = 0x40, }; +enum intel_native_id { + INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */ + INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */ +}; + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ed580c7f9d0aa..1a0dc2b2bf5b8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -175,6 +175,9 @@ extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, un extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); +#define arch_memremap_wb arch_memremap_wb + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 562a547c29a5b..735c3a491f604 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. \ */ \ - if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. \ */ \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ + __this_cpu_write(hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ + __this_cpu_write(hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ + __this_cpu_write(hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ + __this_cpu_write(hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 8ad187462b68e..e3589d6aec24f 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -71,41 +72,32 @@ static inline void crash_setup_regs(struct pt_regs *newregs, if (oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); } else { + asm volatile("mov %%" _ASM_BX ",%0" : "=m"(newregs->bx)); + asm volatile("mov %%" _ASM_CX ",%0" : "=m"(newregs->cx)); + asm volatile("mov %%" _ASM_DX ",%0" : "=m"(newregs->dx)); + asm volatile("mov %%" _ASM_SI ",%0" : "=m"(newregs->si)); + asm volatile("mov %%" _ASM_DI ",%0" : "=m"(newregs->di)); + asm volatile("mov %%" _ASM_BP ",%0" : "=m"(newregs->bp)); + asm volatile("mov %%" _ASM_AX ",%0" : "=m"(newregs->ax)); + asm volatile("mov %%" _ASM_SP ",%0" : "=m"(newregs->sp)); +#ifdef CONFIG_X86_64 + asm volatile("mov %%r8,%0" : "=m"(newregs->r8)); + asm volatile("mov %%r9,%0" : "=m"(newregs->r9)); + asm volatile("mov %%r10,%0" : "=m"(newregs->r10)); + asm volatile("mov %%r11,%0" : "=m"(newregs->r11)); + asm volatile("mov %%r12,%0" : "=m"(newregs->r12)); + asm volatile("mov %%r13,%0" : "=m"(newregs->r13)); + asm volatile("mov %%r14,%0" : "=m"(newregs->r14)); + asm volatile("mov %%r15,%0" : "=m"(newregs->r15)); +#endif + asm volatile("mov %%ss,%k0" : "=a"(newregs->ss)); + asm volatile("mov %%cs,%k0" : "=a"(newregs->cs)); #ifdef CONFIG_X86_32 - asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); - asm volatile("movl %%ecx,%0" : "=m"(newregs->cx)); - asm volatile("movl %%edx,%0" : "=m"(newregs->dx)); - asm volatile("movl %%esi,%0" : "=m"(newregs->si)); - asm volatile("movl %%edi,%0" : "=m"(newregs->di)); - asm volatile("movl %%ebp,%0" : "=m"(newregs->bp)); - asm volatile("movl %%eax,%0" : "=m"(newregs->ax)); - asm volatile("movl %%esp,%0" : "=m"(newregs->sp)); - asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); - asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); - asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds)); - asm volatile("movl %%es, %%eax;" :"=a"(newregs->es)); - asm volatile("pushfl; popl %0" :"=m"(newregs->flags)); -#else - asm volatile("movq %%rbx,%0" : "=m"(newregs->bx)); - asm volatile("movq %%rcx,%0" : "=m"(newregs->cx)); - asm volatile("movq %%rdx,%0" : "=m"(newregs->dx)); - asm volatile("movq %%rsi,%0" : "=m"(newregs->si)); - asm volatile("movq %%rdi,%0" : "=m"(newregs->di)); - asm volatile("movq %%rbp,%0" : "=m"(newregs->bp)); - asm volatile("movq %%rax,%0" : "=m"(newregs->ax)); - asm volatile("movq %%rsp,%0" : "=m"(newregs->sp)); - asm volatile("movq %%r8,%0" : "=m"(newregs->r8)); - asm volatile("movq %%r9,%0" : "=m"(newregs->r9)); - asm volatile("movq %%r10,%0" : "=m"(newregs->r10)); - asm volatile("movq %%r11,%0" : "=m"(newregs->r11)); - asm volatile("movq %%r12,%0" : "=m"(newregs->r12)); - asm volatile("movq %%r13,%0" : "=m"(newregs->r13)); - asm volatile("movq %%r14,%0" : "=m"(newregs->r14)); - asm volatile("movq %%r15,%0" : "=m"(newregs->r15)); - asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); - asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); - asm volatile("pushfq; popq %0" :"=m"(newregs->flags)); + asm volatile("mov %%ds,%k0" : "=a"(newregs->ds)); + asm volatile("mov %%es,%k0" : "=a"(newregs->es)); #endif + asm volatile("pushf\n\t" + "pop %0" : "=m"(newregs->flags)); newregs->ip = _THIS_IP_; } } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b7af5902ff75..32ae3aa50c7e3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -780,6 +780,7 @@ struct kvm_vcpu_arch { u32 pkru; u32 hflags; u64 efer; + u64 host_debugctl; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool load_eoi_exitmap_pending; diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index dc31b13b87a0d..4835c67bb5ddb 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -119,33 +119,27 @@ /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ #define SYM_FUNC_START_NOALIGN(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ #define SYM_FUNC_START_LOCAL_NOALIGN(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ #define SYM_FUNC_START_WEAK_NOALIGN(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_WEAK, SYM_A_NONE) #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index eb2db07ef39c8..6c77c03139f7f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -296,8 +296,6 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -bool mce_notify_irq(void); - DECLARE_PER_CPU(struct mce, injectm); /* Disable CMCI/polling for MCA bank claimed by firmware */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 3b496cdcb74b3..8b8055a8eb9ee 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -69,6 +69,18 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif + +#ifdef CONFIG_BROADCAST_TLB_FLUSH + /* + * The global ASID will be a non-zero value when the process has + * the same ASID across all CPUs, allowing it to make use of + * hardware-assisted remote TLB invalidation like AMD INVLPGB. + */ + u16 global_asid; + + /* The process is transitioning to a new global ASID number. */ + bool asid_transition; +#endif } mm_context_t; #define INIT_MM_CONTEXT(mm) \ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 795fdd53bd0a6..2398058b6e83c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H -#include #include #include #include @@ -13,6 +12,7 @@ #include #include #include +#include extern atomic64_t last_mm_ctx_id; @@ -139,6 +139,11 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm) #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); +#define mm_init_global_asid mm_init_global_asid +extern void mm_init_global_asid(struct mm_struct *mm); + +extern void mm_free_global_asid(struct mm_struct *mm); + /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). @@ -161,6 +166,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif + + mm_init_global_asid(mm); mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; @@ -170,6 +177,7 @@ static inline int init_new_context(struct task_struct *tsk, static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); + mm_free_global_asid(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f91ab1e75f9ff..5e6193dbc97e3 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -77,11 +77,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return hv_tdx_hypercall(control, input_address, output_address); if (hv_isolation_type_snp() && !hyperv_paravisor_present) { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" "vmmcall" : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address) + : [output_address] "r" (output_address) : "cc", "memory", "r8", "r9", "r10", "r11"); return hv_status; } @@ -89,12 +89,12 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), - THUNK_TARGET(hv_hypercall_pg) + : [output_address] "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -187,18 +187,18 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) return hv_tdx_hypercall(control, input1, input2); if (hv_isolation_type_snp() && !hyperv_paravisor_present) { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[input2], %%r8\n" "vmmcall" : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2) + : [input2] "r" (input2) : "cc", "r8", "r9", "r10", "r11"); } else { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[input2], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2), + : [input2] "r" (input2), THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 72765b2fe0d87..bc6d2de109b51 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) @@ -34,6 +35,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* @@ -721,6 +723,7 @@ /* Zen4 */ #define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 /* Fam 19h MSRs */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 41a0ebb699ec6..f677382093f36 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -56,6 +56,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler); + void stop_nmi(void); void restart_nmi(void); void local_touch_nmi(void); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e8bf78c03d5d..e8757d7a35825 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,7 +12,6 @@ #include #include #include -#include /* * Call depth tracking for Intel SKL CPUs to address the RSB underflow @@ -78,21 +77,21 @@ #include #define CREDIT_CALL_DEPTH \ - movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq $-1, PER_CPU_VAR(__x86_call_depth); #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq %rax, PER_CPU_VAR(__x86_call_depth); #define RESET_CALL_DEPTH_FROM_CALL \ movb $0xfc, %al; \ shl $56, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + movq %rax, PER_CPU_VAR(__x86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + sarq $5, PER_CPU_VAR(__x86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else @@ -198,9 +197,8 @@ .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -388,6 +386,8 @@ extern void call_depth_return_thunk(void); __stringify(INCREMENT_CALL_DEPTH), \ X86_FEATURE_CALL_DEPTH) +DECLARE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); + #ifdef CONFIG_CALL_THUNKS_DEBUG DECLARE_PER_CPU(u64, __x86_call_count); DECLARE_PER_CPU(u64, __x86_ret_count); @@ -420,20 +420,27 @@ static inline void call_depth_return_thunk(void) {} #ifdef CONFIG_X86_64 +/* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#ifdef CONFIG_MITIGATION_RETPOLINE +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" +#else +#define CALL_NOSPEC "call *%[thunk_target]\n" +#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) @@ -515,7 +522,7 @@ extern u64 x86_pred_cmd; static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); + alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_IBPB); } /* The Intel SPEC CTRL MSR base value cache */ @@ -552,6 +559,8 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb); + DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index faf9cc1c14bb6..25c32652f4042 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -11,8 +11,8 @@ * a virtual address space of one gigabyte, which limits the * amount of physical memory you can use to about 950MB. * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. + * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G + * and CONFIG_HIGHMEM4G options in the kernel configuration. */ #define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL) #define __PAGE_OFFSET __PAGE_OFFSET_BASE diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index d63576608ce76..b5279f5d56014 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -55,11 +55,12 @@ static inline void clear_page(void *page) clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, "=D" (page), - "D" (page) - : "cc", "memory", "rax", "rcx"); + "D" (page), + "cc", "memory", "rax", "rcx"); } void copy_page(void *to, void *from); +KCFI_REFERENCE(copy_page); #ifdef CONFIG_X86_5LEVEL /* diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 041aff51eb50f..38a632a282d40 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } -static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); -} - static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index fea56b04f4365..127a372dacc94 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -134,8 +134,6 @@ struct pv_mmu_ops { void (*flush_tlb_multi)(const struct cpumask *cpus, const struct flush_tlb_info *info); - void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); - /* Hook for intercepting the destruction of an mm_struct. */ void (*exit_mmap)(struct mm_struct *mm); void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); @@ -242,9 +240,17 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) -int paravirt_disable_iospace(void); - -/* This generates an indirect call based on the operation type number. */ +/* + * This generates an indirect call based on the operation type number. + * + * Since alternatives run after enabling CET/IBT -- the latter setting/clearing + * capabilities and the former requiring all capabilities being finalized -- + * these indirect calls are subject to IBT and the paravirt stubs should have + * ENDBR on. + * + * OTOH since this is effectively a __nocfi indirect call, the paravirt stubs + * don't need to bother with CFI prefixes. + */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ "call *%[paravirt_opptr];" diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999f..462d071c87d46 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -20,14 +20,9 @@ #define PER_CPU_VAR(var) __percpu(var)__percpu_rel -#ifdef CONFIG_X86_64_SMP -# define INIT_PER_CPU_VAR(var) init_per_cpu__##var -#else -# define INIT_PER_CPU_VAR(var) var -#endif - #else /* !__ASSEMBLY__: */ +#include #include #include #include @@ -41,12 +36,7 @@ # define __seg_fs __attribute__((address_space(__seg_fs))) #endif -#ifdef CONFIG_X86_64 -# define __percpu_seg_override __seg_gs -#else -# define __percpu_seg_override __seg_fs -#endif - +#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg) #define __percpu_prefix "" #else /* !CONFIG_CC_HAS_NAMED_AS: */ @@ -97,22 +87,6 @@ #define __percpu_arg(x) __percpu_prefix "%" #x #define __force_percpu_arg(x) __force_percpu_prefix "%" #x -/* - * Initialized pointers to per-CPU variables needed for the boot - * processor need to use these macros to get the proper address - * offset from __per_cpu_load on SMP. - * - * There also must be an entry in vmlinux_64.lds.S - */ -#define DECLARE_INIT_PER_CPU(var) \ - extern typeof(var) init_per_cpu_var(var) - -#ifdef CONFIG_X86_64_SMP -# define init_per_cpu_var(var) init_per_cpu__##var -#else -# define init_per_cpu_var(var) var -#endif - /* * For arch-specific code, we can use direct single-insn ops (they * don't give an lvalue though). @@ -128,15 +102,10 @@ #define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) #define __pcpu_cast_8(val) ((u64)(val)) -#define __pcpu_op1_1(op, dst) op "b " dst -#define __pcpu_op1_2(op, dst) op "w " dst -#define __pcpu_op1_4(op, dst) op "l " dst -#define __pcpu_op1_8(op, dst) op "q " dst - -#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst -#define __pcpu_op2_2(op, src, dst) op "w " src ", " dst -#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst -#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst +#define __pcpu_op_1(op) op "b " +#define __pcpu_op_2(op) op "w " +#define __pcpu_op_4(op) op "l " +#define __pcpu_op_8(op) op "q " #define __pcpu_reg_1(mod, x) mod "q" (x) #define __pcpu_reg_2(mod, x) mod "r" (x) @@ -168,7 +137,8 @@ do { \ ({ \ __pcpu_type_##size pfo_val__; \ \ - asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), "%[val]") \ + asm qual (__pcpu_op_##size("mov") \ + __percpu_arg([var]) ", %[val]" \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "m" (__my_cpu_var(_var))); \ \ @@ -184,7 +154,8 @@ do { \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size("mov", "%[val]", __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("mov") "%[val], " \ + __percpu_arg([var]) \ : [var] "=m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) @@ -201,7 +172,8 @@ do { \ ({ \ __pcpu_type_##size pfo_val__; \ \ - asm(__pcpu_op2_##size("mov", __force_percpu_arg(a[var]), "%[val]") \ + asm(__pcpu_op_##size("mov") \ + __force_percpu_arg(a[var]) ", %[val]" \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "i" (&(_var))); \ \ @@ -210,7 +182,7 @@ do { \ #define percpu_unary_op(size, qual, op, _var) \ ({ \ - asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ + asm qual (__pcpu_op_##size(op) __percpu_arg([var]) \ : [var] "+m" (__my_cpu_var(_var))); \ }) @@ -223,7 +195,7 @@ do { \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ + asm qual (__pcpu_op_##size(op) "%[val], " __percpu_arg([var]) \ : [var] "+m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) @@ -259,8 +231,8 @@ do { \ ({ \ __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \ \ - asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("xadd") "%[tmp], " \ + __percpu_arg([var]) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ @@ -303,8 +275,8 @@ do { \ __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ : [oval] "+a" (pco_old__), \ [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ @@ -320,8 +292,8 @@ do { \ __pcpu_type_##size pco_old__ = *pco_oval__; \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ CC_SET(z) \ : CC_OUT(z) (success), \ [oval] "+a" (pco_old__), \ @@ -348,15 +320,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -378,17 +349,16 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ \ @@ -419,15 +389,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -449,19 +418,19 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ + \ likely(success); \ }) @@ -582,7 +551,7 @@ do { \ * it is accessed while this_cpu_read_stable() allows the value to be cached. * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across CPUs. The current users include - * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are + * current_task and cpu_current_top_of_stack, both of which are * actually per-thread variables implemented as per-CPU variables and * thus stable for the duration of the respective task. */ @@ -617,7 +586,7 @@ do { \ #include /* We can use this directly for local CPU (faster). */ -DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); +DECLARE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0ba8d20f2d1d5..812dac3f79f04 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,6 +141,12 @@ #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +#define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_CNTR_SHIFT 32 +#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) +#define PEBS_DATACFG_FIX_SHIFT 48 +#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) +#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -482,6 +488,15 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +struct pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +#define INTEL_CNTR_METRICS 0x3 + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ @@ -509,6 +524,8 @@ struct pebs_xmm { #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) +#define IBS_CAPS_OPLDLAT (1U<<12) +#define IBS_CAPS_OPDTLBPGSIZE (1U<<19) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -534,8 +551,11 @@ struct pebs_xmm { * The lower 7 bits of the current count are random bits * preloaded by hardware and ignored in software */ +#define IBS_OP_LDLAT_EN (1ULL<<63) +#define IBS_OP_LDLAT_THRSH (0xFULL<<59) #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd4841231bb9f..a33147520044b 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -29,11 +29,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -/* - * Flags to use when allocating a user page table page. - */ -extern gfp_t __userpte_alloc_gfp; - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 7f6ccff0ba727..4a12c276b1812 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -23,17 +23,17 @@ typedef union { #define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* - * traditional i386 two-level paging structure: + * Traditional i386 two-level paging structure: */ #define PGDIR_SHIFT 22 #define PTRS_PER_PGD 1024 - /* - * the i386 is two-level, so we don't really have any - * PMD directory physically. + * The i386 is two-level, so we don't really have any + * PMD directory physically: */ +#define PTRS_PER_PMD 1 #define PTRS_PER_PTE 1024 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4b804531b03c3..74d461cc8e208 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -33,6 +33,7 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 #ifdef CONFIG_X86_64 @@ -64,6 +65,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) @@ -177,7 +179,7 @@ enum page_cache_mode { }; #endif -#define _PAGE_CC (_AT(pteval_t, cc_mask)) +#define _PAGE_CC (_AT(pteval_t, cc_get_mask())) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 919909d8cb77e..578441db09f0b 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,10 +4,11 @@ #include #include -#include #include +DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count); + /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -23,18 +24,18 @@ */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { int old, new; - old = raw_cpu_read_4(pcpu_hot.preempt_count); + old = raw_cpu_read_4(__preempt_count); do { new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new)); + } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new)); } /* @@ -43,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -57,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -76,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(pcpu_hot.preempt_count, val); + raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(pcpu_hot.preempt_count, -val); + raw_cpu_add_4(__preempt_count, -val); } /* @@ -91,7 +92,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e, + return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e, __percpu_arg([var])); } @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0cd10182e90c..5d2f7e5aff261 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -60,18 +60,13 @@ struct vm86; # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -enum tlb_infos { - ENTRIES, - NR_INFO -}; - -extern u16 __read_mostly tlb_lli_4k[NR_INFO]; -extern u16 __read_mostly tlb_lli_2m[NR_INFO]; -extern u16 __read_mostly tlb_lli_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4k[NR_INFO]; -extern u16 __read_mostly tlb_lld_2m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_1g[NR_INFO]; +extern u16 __read_mostly tlb_lli_4k; +extern u16 __read_mostly tlb_lli_2m; +extern u16 __read_mostly tlb_lli_4m; +extern u16 __read_mostly tlb_lld_4k; +extern u16 __read_mostly tlb_lld_2m; +extern u16 __read_mostly tlb_lld_4m; +extern u16 __read_mostly tlb_lld_1g; /* * CPU type and hardware bug flags. Kept separately for each CPU. @@ -234,7 +229,7 @@ static inline unsigned long long l1tf_pfn_limit(void) void init_cpu_devs(void); void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); -extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void identify_secondary_cpu(unsigned int cpu); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); @@ -420,37 +415,33 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); +DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); #ifdef CONFIG_X86_64 -struct fixed_percpu_data { - /* - * GCC hardcodes the stack canary as %gs:40. Since the - * irq_stack is the object at %gs:0, we reserve the bottom - * 48 bytes of the irq stack for the canary. - * - * Once we are willing to require -mstack-protector-guard-symbol= - * support for x86_64 stackprotector, we can get rid of this. - */ - char gs_base[40]; - unsigned long stack_canary; -}; +DECLARE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); +#else +DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); +#endif -DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; -DECLARE_INIT_PER_CPU(fixed_percpu_data); +DECLARE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack); +/* const-qualified alias provided by the linker. */ +DECLARE_PER_CPU_CACHE_HOT(const unsigned long __percpu_seg_override, + const_cpu_current_top_of_stack); +#ifdef CONFIG_X86_64 static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); +#ifdef CONFIG_SMP + return per_cpu_offset(cpu); +#else + return 0; +#endif } extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); -#else /* X86_64 */ -#ifdef CONFIG_STACKPROTECTOR -DECLARE_PER_CPU(unsigned long, __stack_chk_guard); -#endif -#endif /* !X86_64 */ +#endif /* X86_64 */ struct perf_event; @@ -561,9 +552,9 @@ static __always_inline unsigned long current_top_of_stack(void) * entry trampoline. */ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) - return this_cpu_read_const(const_pcpu_hot.top_of_stack); + return this_cpu_read_const(const_cpu_current_top_of_stack); - return this_cpu_read_stable(pcpu_hot.top_of_stack); + return this_cpu_read_stable(cpu_current_top_of_stack); } static __always_inline bool on_thread_stack(void) @@ -668,8 +659,6 @@ static __always_inline void prefetchw(const void *x) .sysenter_cs = __KERNEL_CS, \ } -#define KSTK_ESP(task) (task_pt_regs(task)->sp) - #else extern unsigned long __top_init_kernel_stack[]; @@ -677,8 +666,6 @@ extern unsigned long __top_init_kernel_stack[]; .sp = (unsigned long)&__top_init_kernel_stack, \ } -extern unsigned long KSTK_ESP(struct task_struct *task); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, @@ -692,6 +679,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) /* Get/set a process' ability to use the timestamp counter instruction */ #define GET_TSC_CTL(adr) get_tsc_mode((adr)) @@ -757,6 +745,7 @@ extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, + MDS_MITIGATION_AUTO, MDS_MITIGATION_FULL, MDS_MITIGATION_VMWERV, }; diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 484f4f0131a5c..05224a6958727 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -15,7 +15,6 @@ void entry_SYSCALL_64(void); void entry_SYSCALL_64_safe_stack(void); void entry_SYSRETQ_unsafe_stack(void); void entry_SYSRETQ_end(void); -long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif #ifdef CONFIG_X86_32 @@ -41,6 +40,6 @@ void x86_configure_nx(void); extern int reboot_force; -long do_arch_prctl_common(int option, unsigned long arg2); +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h deleted file mode 100644 index e9187ddd3d1fd..0000000000000 --- a/arch/x86/include/asm/required-features.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#define _ASM_X86_REQUIRED_FEATURES_H - -/* Define minimum CPUID feature set for kernel These bits are checked - really early to actually display a visible error message before the - kernel dies. Make sure to assign features to the proper mask! - - Some requirements that are not in CPUID yet are also in the - CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. - - The real information is in arch/x86/Kconfig.cpu, this just converts - the CONFIGs into a bitmask */ - -#ifndef CONFIG_MATH_EMULATION -# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) -#else -# define NEED_FPU 0 -#endif - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -#else -# define NEED_PAE 0 -#endif - -#ifdef CONFIG_X86_CMPXCHG64 -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) -#else -# define NEED_CX8 0 -#endif - -#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) -# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) -#else -# define NEED_CMOV 0 -#endif - -# define NEED_3DNOW 0 - -#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) -# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) -#else -# define NEED_NOPL 0 -#endif - -#ifdef CONFIG_MATOM -# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) -#else -# define NEED_MOVBE 0 -#endif - -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL -/* Paravirtualized systems may not have PSE or PGE available */ -#define NEED_PSE 0 -#define NEED_PGE 0 -#else -#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) -#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) -#endif -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) -#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) -#define NEED_LM (1<<(X86_FEATURE_LM & 31)) -#else -#define NEED_PSE 0 -#define NEED_MSR 0 -#define NEED_PGE 0 -#define NEED_FXSR 0 -#define NEED_XMM 0 -#define NEED_XMM2 0 -#define NEED_LM 0 -#endif - -#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ - NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ - NEED_XMM|NEED_XMM2) -#define SSE_MASK (NEED_XMM|NEED_XMM2) - -#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) - -#define REQUIRED_MASK2 0 -#define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 (NEED_MOVBE) -#define REQUIRED_MASK5 0 -#define REQUIRED_MASK6 0 -#define REQUIRED_MASK7 0 -#define REQUIRED_MASK8 0 -#define REQUIRED_MASK9 0 -#define REQUIRED_MASK10 0 -#define REQUIRED_MASK11 0 -#define REQUIRED_MASK12 0 -#define REQUIRED_MASK13 0 -#define REQUIRED_MASK14 0 -#define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 -#define REQUIRED_MASK17 0 -#define REQUIRED_MASK18 0 -#define REQUIRED_MASK19 0 -#define REQUIRED_MASK20 0 -#define REQUIRED_MASK21 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index 6652ebddfd02f..8d983cfd06ea6 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -2,6 +2,18 @@ #ifndef _ASM_RUNTIME_CONST_H #define _ASM_RUNTIME_CONST_H +#ifdef __ASSEMBLY__ + +.macro RUNTIME_CONST_PTR sym reg + movq $0x0123456789abcdef, %\reg + 1: + .pushsection runtime_ptr_\sym, "a" + .long 1b - 8 - . + .popsection +.endm + +#else /* __ASSEMBLY__ */ + #define runtime_const_ptr(sym) ({ \ typeof(sym) __ret; \ asm_inline("mov %1,%0\n1:\n" \ @@ -58,4 +70,5 @@ static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), } } +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index cc62ef70ccc0a..8d9f1c9aaa4ce 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H -#include #include #include @@ -38,7 +37,6 @@ int set_memory_rox(unsigned long addr, int numpages); * The caller is required to take care of these. */ -int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot); int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); int _set_memory_wt(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 85f4fde3515c4..a8d676bba5def 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -46,6 +46,7 @@ void setup_bios_corruption_check(void); void early_platform_quirks(void); extern unsigned long saved_video_mode; +extern unsigned long acpi_realmode_flags; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 1581246491b54..ba7999f66abe6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -203,6 +203,9 @@ struct snp_guest_req { unsigned int vmpck_id; u8 msg_version; u8 msg_type; + + struct snp_req_data input; + void *certs_data; }; /* @@ -263,9 +266,6 @@ struct snp_msg_desc { struct snp_guest_msg secret_request, secret_response; struct snp_secrets_page *secrets; - struct snp_req_data input; - - void *certs_data; struct aesgcm_ctx *ctx; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ca073f40698fa..bcfa00232d790 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -3,10 +3,11 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include +#include #include -#include -#include + +DECLARE_PER_CPU_CACHE_HOT(int, cpu_number); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); @@ -114,13 +115,12 @@ void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); +void __noreturn mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void smp_store_cpu_info(int id); - asmlinkage __visible void smp_reboot_interrupt(void); __visible void smp_reschedule_interrupt(struct pt_regs *regs); __visible void smp_call_function_interrupt(struct pt_regs *regs); @@ -133,14 +133,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); * This function is needed by all SMP systems. It must _always_ be valid * from the initial startup. */ -#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) -#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) - -#ifdef CONFIG_X86_32 -extern int safe_smp_processor_id(void); -#else -# define safe_smp_processor_id() smp_processor_id() -#endif +#define raw_smp_processor_id() this_cpu_read(cpu_number) +#define __smp_processor_id() __this_cpu_read(cpu_number) static inline struct cpumask *cpu_llc_shared_mask(int cpu) { @@ -164,6 +158,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return (struct cpumask *)cpumask_of(0); } + +static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 03e7c2d495597..21ce480658b14 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -42,14 +42,14 @@ static __always_inline void native_write_cr2(unsigned long val) asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } -static inline unsigned long __native_read_cr3(void) +static __always_inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } -static inline void native_write_cr3(unsigned long val) +static __always_inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h deleted file mode 100644 index e0975e9c4f477..0000000000000 --- a/arch/x86/include/asm/sta2x11.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Header file for STMicroelectronics ConneXt (STA2X11) IOHub - */ -#ifndef __ASM_STA2X11_H -#define __ASM_STA2X11_H - -#include - -/* This needs to be called from the MFD to configure its sub-devices */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev); - -#endif /* __ASM_STA2X11_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 00473a650f512..cd761b14eb029 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -2,26 +2,10 @@ /* * GCC stack protector support. * - * Stack protector works by putting predefined pattern at the start of + * Stack protector works by putting a predefined pattern at the start of * the stack frame and verifying that it hasn't been overwritten when - * returning from the function. The pattern is called stack canary - * and unfortunately gcc historically required it to be at a fixed offset - * from the percpu segment base. On x86_64, the offset is 40 bytes. - * - * The same segment is shared by percpu area and stack canary. On - * x86_64, percpu symbols are zero based and %gs (64-bit) points to the - * base of percpu area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at the appropriate - * offset. On x86_32, the stack canary is just a regular percpu - * variable. - * - * Putting percpu data in %fs on 32-bit is a minor optimization compared to - * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely - * to load 0 into %fs on exit to usermode, whereas with percpu data in - * %gs, we are likely to load a non-null %gs on return to user mode. - * - * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector - * support, we can remove some of this complexity. + * returning from the function. The pattern is called the stack canary + * and is a unique value for each task. */ #ifndef _ASM_STACKPROTECTOR_H @@ -36,6 +20,8 @@ #include +DECLARE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); + /* * Initialize the stackprotector canary value. * @@ -51,25 +37,13 @@ static __always_inline void boot_init_stack_canary(void) { unsigned long canary = get_random_canary(); -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); -#endif - current->stack_canary = canary; -#ifdef CONFIG_X86_64 - this_cpu_write(fixed_percpu_data.stack_canary, canary); -#else this_cpu_write(__stack_chk_guard, canary); -#endif } static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) { -#ifdef CONFIG_X86_64 - per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; -#else per_cpu(__stack_chk_guard, cpu) = idle->stack_canary; -#endif } #else /* STACKPROTECTOR */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 9d0b324eab213..79e9695dc13e4 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -21,6 +21,7 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); void *__memset(void *s, int c, size_t n); +KCFI_REFERENCE(__memset); /* * KMSAN needs to instrument as much code as possible. Use C versions of @@ -70,6 +71,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); void *__memmove(void *dest, const void *src, size_t count); +KCFI_REFERENCE(__memmove); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 6d8d6bc183b74..cd21a0405ac52 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -31,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" + asm volatile("lock " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -49,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" + asm volatile("lock " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -66,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" + asm volatile("lock " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -82,7 +82,7 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -95,7 +95,7 @@ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -108,7 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 77f52bc1578a7..866ea78ba156a 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,9 @@ static inline void tlb_flush(struct mmu_gather *tlb); #include +#include +#include +#include static inline void tlb_flush(struct mmu_gather *tlb) { @@ -25,4 +28,139 @@ static inline void invlpg(unsigned long addr) asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } +enum addr_stride { + PTE_STRIDE = 0, + PMD_STRIDE = 1 +}; + +/* + * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination + * of the three. For example: + * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address + * - FLAG_PCID: invalidate all TLB entries matching the PCID + * + * The first is used to invalidate (kernel) mappings at a particular + * address across all processes. + * + * The latter invalidates all TLB entries matching a PCID. + */ +#define INVLPGB_FLAG_VA BIT(0) +#define INVLPGB_FLAG_PCID BIT(1) +#define INVLPGB_FLAG_ASID BIT(2) +#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) +#define INVLPGB_FLAG_FINAL_ONLY BIT(4) +#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) + +/* The implied mode when all bits are clear: */ +#define INVLPGB_MODE_ALL_NONGLOBALS 0UL + +#ifdef CONFIG_BROADCAST_TLB_FLUSH +/* + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. + * + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can + * be done in a parallel fashion. + * + * The instruction takes the number of extra pages to invalidate, beyond the + * first page, while __invlpgb gets the more human readable number of pages to + * invalidate. + * + * The bits in rax[0:2] determine respectively which components of the address + * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* + * address in the specified range matches. + * + * Since it is desired to only flush TLB entries for the ASID that is executing + * the instruction (a host/hypervisor or a guest), the ASID valid bit should + * always be set. On a host/hypervisor, the hardware will use the ASID value + * specified in EDX[15:0] (which should be 0). On a guest, the hardware will + * use the actual ASID value of the guest. + * + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from + * this CPU have completed. + */ +static inline void __invlpgb(unsigned long asid, unsigned long pcid, + unsigned long addr, u16 nr_pages, + enum addr_stride stride, u8 flags) +{ + u64 rax = addr | flags | INVLPGB_FLAG_ASID; + u32 ecx = (stride << 31) | (nr_pages - 1); + u32 edx = (pcid << 16) | asid; + + /* The low bits in rax are for flags. Verify addr is clean. */ + VM_WARN_ON_ONCE(addr & ~PAGE_MASK); + + /* INVLPGB; supported in binutils >= 2.36. */ + asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); +} + +static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) +{ + __invlpgb(asid, pcid, 0, 1, 0, flags); +} + +static inline void __tlbsync(void) +{ + /* + * TLBSYNC waits for INVLPGB instructions originating on the same CPU + * to have completed. Print a warning if the task has been migrated, + * and might not be waiting on all the INVLPGBs issued during this TLB + * invalidation sequence. + */ + cant_migrate(); + + /* TLBSYNC: supported in binutils >= 0.36. */ + asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); +} +#else +/* Some compilers (I'm looking at you clang!) simply can't do DCE */ +static inline void __invlpgb(unsigned long asid, unsigned long pcid, + unsigned long addr, u16 nr_pages, + enum addr_stride s, u8 flags) { } +static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } +static inline void __tlbsync(void) { } +#endif + +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, + unsigned long addr, + u16 nr, bool stride) +{ + enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; + u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; + + __invlpgb(0, pcid, addr, nr, str, flags); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) +{ + __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invlpgb_flush_all(void) +{ + /* + * TLBSYNC at the end needs to make sure all flushes done on the + * current CPU have been executed system-wide. Therefore, make + * sure nothing gets migrated in-between but disable preemption + * as it is cheaper. + */ + guard(preempt)(); + __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); + __tlbsync(); +} + +/* Flush addr, including globals, for all PCIDs. */ +static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) +{ + __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invlpgb_flush_all_nonglobals(void) +{ + guard(preempt)(); + __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); + __tlbsync(); +} #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 02fc2aa06e9e0..7cad283d502d0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -183,6 +184,9 @@ static inline void cr4_init_shadow(void) extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; +/* How many pages can be invalidated with one INVLPGB. */ +extern u16 invlpgb_count_max; + extern void initialize_tlbstate_and_flush(void); /* @@ -231,6 +235,71 @@ void flush_tlb_one_kernel(unsigned long addr); void flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); +static inline bool is_dyn_asid(u16 asid) +{ + return asid < TLB_NR_DYN_ASIDS; +} + +static inline bool is_global_asid(u16 asid) +{ + return !is_dyn_asid(asid); +} + +#ifdef CONFIG_BROADCAST_TLB_FLUSH +static inline u16 mm_global_asid(struct mm_struct *mm) +{ + u16 asid; + + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return 0; + + asid = smp_load_acquire(&mm->context.global_asid); + + /* mm->context.global_asid is either 0, or a global ASID */ + VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); + + return asid; +} + +static inline void mm_init_global_asid(struct mm_struct *mm) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + mm->context.global_asid = 0; + mm->context.asid_transition = false; + } +} + +static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) +{ + /* + * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> + * finish_asid_transition() needs to observe asid_transition = true + * once it observes global_asid. + */ + mm->context.asid_transition = true; + smp_store_release(&mm->context.global_asid, asid); +} + +static inline void mm_clear_asid_transition(struct mm_struct *mm) +{ + WRITE_ONCE(mm->context.asid_transition, false); +} + +static inline bool mm_in_asid_transition(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return false; + + return mm && READ_ONCE(mm->context.asid_transition); +} +#else +static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } +static inline void mm_init_global_asid(struct mm_struct *mm) { } +static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } +static inline void mm_clear_asid_transition(struct mm_struct *mm) { } +static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } +#endif /* CONFIG_BROADCAST_TLB_FLUSH */ + #ifdef CONFIG_PARAVIRT #include #endif @@ -242,7 +311,7 @@ void flush_tlb_multi(const struct cpumask *cpumask, flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT, false) + : PAGE_SHIFT, true) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1f1deaecd364b..869b880618018 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -35,8 +35,6 @@ static inline int get_si_code(unsigned long condition) return TRAP_BRKPT; } -extern int panic_on_unrecovered_nmi; - void math_emulate(struct math_emu_info *); bool fault_in_kernel_space(unsigned long address); diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a75..1ee2e5115955c 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index d7f6592b74a94..80be0da733df4 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -18,12 +18,6 @@ struct vdso_image { unsigned long extable_base, extable_len; const void *extable; - long sym_vvar_start; /* Negative offset to the vvar area */ - - long sym_vvar_page; - long sym_pvclock_page; - long sym_hvclock_page; - long sym_timens_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h index 2bf9c0e970c3e..af85539f6557b 100644 --- a/arch/x86/include/asm/vdso/getrandom.h +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -27,16 +27,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig return ret; } -extern struct vdso_rng_data vdso_rng_data - __attribute__((visibility("hidden"))); - -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - if (IS_ENABLED(CONFIG_TIME_NS) && __arch_get_vdso_data()->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&vdso_rng_data + ((void *)&timens_page - (void *)__arch_get_vdso_data()); - return &vdso_rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 375a34b0f3657..9e52cc46e1da9 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -19,12 +19,6 @@ #include #include -extern struct vdso_data vvar_page - __attribute__((visibility("hidden"))); - -extern struct vdso_data timens_page - __attribute__((visibility("hidden"))); - #define VDSO_HAS_TIME 1 #define VDSO_HAS_CLOCK_GETRES 1 @@ -59,14 +53,6 @@ extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return &timens_page; -} -#endif - #ifndef BUILD_VDSO32 static __always_inline @@ -250,7 +236,7 @@ static u64 vread_hvclock(void) #endif static inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) return (u64)rdtsc_ordered() & S64_MAX; @@ -275,12 +261,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode, return U64_MAX; } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return &vvar_page; -} - -static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool arch_vdso_clocksource_ok(const struct vdso_clock *vc) { return true; } @@ -319,34 +300,34 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * declares everything with the MSB/Sign-bit set as invalid. Therefore the * effective mask is S64_MAX. */ -static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base) +static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base) { - u64 delta = cycles - vd->cycle_last; + u64 delta = cycles - vc->cycle_last; /* * Negative motion and deltas which can cause multiplication * overflow require special treatment. This check covers both as - * negative motion is guaranteed to be greater than @vd::max_cycles + * negative motion is guaranteed to be greater than @vc::max_cycles * due to unsigned comparison. * * Due to the MSB/Sign-bit being used as invalid marker (see * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that * case is also unlikely and will also take the unlikely path here. */ - if (unlikely(delta > vd->max_cycles)) { + if (unlikely(delta > vc->max_cycles)) { /* * Due to the above mentioned TSC wobbles, filter out * negative motion. Per the above masking, the effective * sign bit is now bit 62. */ if (delta & (1ULL << 62)) - return base >> vd->shift; + return base >> vc->shift; /* Handle multiplication overflow gracefully */ - return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift); + return mul_u64_u32_add_u64_shr(delta & S64_MAX, vc->mult, base, vc->shift); } - return ((delta * vd->mult) + base) >> vd->shift; + return ((delta * vc->mult) + base) >> vc->shift; } #define vdso_calc_ns vdso_calc_ns diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index 37b4a70559a82..ebbf63420af03 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -2,10 +2,10 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 640 -#define __VVAR_PAGES 4 +#define __VDSO_PAGES 6 #define VDSO_NR_VCLOCK_PAGES 2 +#define VDSO_VCLOCK_PAGES_START(_b) ((_b) + (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE) #define VDSO_PAGE_PVCLOCK_OFFSET 0 #define VDSO_PAGE_HVCLOCK_OFFSET 1 @@ -14,25 +14,6 @@ #include #include -extern struct vdso_data *vdso_data; - -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. - */ -static __always_inline -struct vdso_data *__x86_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __x86_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__x86_get_k_vdso_rng_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __x86_get_k_vdso_rng_data - /* The asm-generic header needs to be included after the definitions above */ #include diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec37..5d471253c7554 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -15,8 +15,6 @@ #define MODULE_PROC_FAMILY "586TSC " #elif defined CONFIG_M586MMX #define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,8 +31,6 @@ #define MODULE_PROC_FAMILY "K6 " #elif defined CONFIG_MK7 #define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 9b82eebd7add5..dafbf581c515d 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -26,7 +26,7 @@ #define XLF_5LEVEL_ENABLED (1<<6) #define XLF_MEM_ENCRYPTION (1<<7) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -210,6 +210,6 @@ enum x86_hardware_subarch { X86_NR_SUBARCHS, }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 2f491efe3a126..55bc668671560 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -54,7 +54,7 @@ */ #define E820_RESERVED_KERN 128 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include struct e820entry { __u64 addr; /* start of memory segment */ @@ -76,7 +76,7 @@ struct e820map { #define BIOS_ROM_BASE 0xffe00000 #define BIOS_ROM_END 0xffffffff -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_E820_H */ diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index d62ac5db093b4..a82c039d8e6a7 100644 --- a/arch/x86/include/uapi/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h @@ -12,7 +12,7 @@ /* The size of each LDT entry. */ #define LDT_ENTRY_SIZE 8 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS * not to the default values if you still want to do syscalls. This @@ -44,5 +44,5 @@ struct user_desc { #define MODIFY_LDT_CONTENTS_STACK 1 #define MODIFY_LDT_CONTENTS_CODE 2 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_LDT_H */ diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h index e7516b402a00f..4b8917ca28fe7 100644 --- a/arch/x86/include/uapi/asm/msr.h +++ b/arch/x86/include/uapi/asm/msr.h @@ -2,7 +2,7 @@ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -10,5 +10,5 @@ #define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) #define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_MSR_H */ diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 16074b9c93bb5..5823584dea132 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,7 +25,7 @@ #else /* __i386__ */ -#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +#if defined(__ASSEMBLER__) || defined(__FRAME_OFFSETS) /* * C ABI says these regs are callee-preserved. They aren't saved on kernel entry * unless syscall needs a complete, fully filled "struct pt_regs". @@ -57,7 +57,7 @@ #define EFLAGS 144 #define RSP 152 #define SS 160 -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* top of stack page */ #define FRAME_SIZE 168 @@ -87,7 +87,7 @@ #define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #endif diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index 85165c0edafc8..e0b5b4f6226b1 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -7,7 +7,7 @@ #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __i386__ /* this struct defines the way the registers are stored on the @@ -81,6 +81,6 @@ struct pt_regs { -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h index b111b0c185449..50c45ead4e7c9 100644 --- a/arch/x86/include/uapi/asm/setup_data.h +++ b/arch/x86/include/uapi/asm/setup_data.h @@ -18,7 +18,7 @@ #define SETUP_INDIRECT (1<<31) #define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -78,6 +78,6 @@ struct ima_setup_data { __u64 size; } __attribute__((packed)); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index f777346450ec3..1067efabf18b5 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -2,7 +2,7 @@ #ifndef _UAPI_ASM_X86_SIGNAL_H #define _UAPI_ASM_X86_SIGNAL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -16,7 +16,7 @@ struct siginfo; typedef unsigned long sigset_t; #endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define SIGHUP 1 @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # ifndef __KERNEL__ @@ -106,6 +106,6 @@ typedef struct sigaltstack { __kernel_size_t ss_size; } stack_t; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_SIGNAL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b43eb7e384eba..84cfa179802c3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,8 @@ KCOV_INSTRUMENT_unwind_orc.o := n KCOV_INSTRUMENT_unwind_frame.o := n KCOV_INSTRUMENT_unwind_guess.o := n +CFLAGS_head32.o := -fno-stack-protector +CFLAGS_head64.o := -fno-stack-protector CFLAGS_irq.o := -I $(src)/../include/asm/trace obj-y += head_$(BITS).o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5854f0b8f0f19..d25584255ab82 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Initialize bm_flags based on the CPU cache properties @@ -205,6 +206,16 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_play_dead(percpu_entry->states[cx->index].eax); +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead); + void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S index 4e498d28cdc85..aefb9cb583add 100644 --- a/arch/x86/kernel/acpi/madt_playdead.S +++ b/arch/x86/kernel/acpi/madt_playdead.S @@ -14,6 +14,7 @@ * rsi: PGD of the identity mapping */ SYM_FUNC_START(asm_acpi_mp_play_dead) + ANNOTATE_NOENDBR /* Turn off global entries. Following CR3 write will flush them. */ movq %cr4, %rdx andq $~(X86_CR4_PGE), %rdx diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c index d5ef6215583bc..f36f28405dcc6 100644 --- a/arch/x86/kernel/acpi/madt_wakeup.c +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -70,58 +70,6 @@ static void __init free_pgt_page(void *pgt, void *dummy) return memblock_free(pgt, PAGE_SIZE); } -/* - * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at - * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches - * to the identity mapping and the function has be present at the same spot in - * the virtual address space before and after switching page tables. - */ -static int __init init_transition_pgtable(pgd_t *pgd) -{ - pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; - unsigned long vaddr, paddr; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - vaddr = (unsigned long)asm_acpi_mp_play_dead; - pgd += pgd_index(vaddr); - if (!pgd_present(*pgd)) { - p4d = (p4d_t *)alloc_pgt_page(NULL); - if (!p4d) - return -ENOMEM; - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); - } - p4d = p4d_offset(pgd, vaddr); - if (!p4d_present(*p4d)) { - pud = (pud_t *)alloc_pgt_page(NULL); - if (!pud) - return -ENOMEM; - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(p4d, vaddr); - if (!pud_present(*pud)) { - pmd = (pmd_t *)alloc_pgt_page(NULL); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, vaddr); - if (!pmd_present(*pmd)) { - pte = (pte_t *)alloc_pgt_page(NULL); - if (!pte) - return -ENOMEM; - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); - } - pte = pte_offset_kernel(pmd, vaddr); - - paddr = __pa(vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); - - return 0; -} - static int __init acpi_mp_setup_reset(u64 reset_vector) { struct x86_mapping_info info = { @@ -130,6 +78,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) .page_flag = __PAGE_KERNEL_LARGE_EXEC, .kernpg_flag = _KERNPG_TABLE_NOENC, }; + unsigned long mstart, mend; pgd_t *pgd; pgd = alloc_pgt_page(NULL); @@ -137,8 +86,6 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) return -ENOMEM; for (int i = 0; i < nr_pfn_mapped; i++) { - unsigned long mstart, mend; - mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { @@ -147,14 +94,24 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) } } - if (kernel_ident_mapping_init(&info, pgd, - PAGE_ALIGN_DOWN(reset_vector), - PAGE_ALIGN(reset_vector + 1))) { + mstart = PAGE_ALIGN_DOWN(reset_vector); + mend = mstart + PAGE_SIZE; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { kernel_ident_mapping_free(&info, pgd); return -ENOMEM; } - if (init_transition_pgtable(pgd)) { + /* + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping + * at the same place as in the kernel page tables. + * asm_acpi_mp_play_dead() switches to the identity mapping and the + * function must be present at the same spot in the virtual address space + * before and after switching page tables. + */ + info.offset = __START_KERNEL_map - phys_base; + mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead)); + mend = mstart + PAGE_SIZE; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { kernel_ident_mapping_free(&info, pgd); return -ENOMEM; } diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index b200a193beeb3..04f561f75e990 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -17,6 +17,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ SYM_FUNC_START(wakeup_long64) + ANNOTATE_NOENDBR movq saved_magic(%rip), %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c71b575bf2292..bf82c6f7d6905 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,10 +392,8 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, - struct module *mod) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { - u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -405,14 +403,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, } if (a->instrlen != 6 || - wr_instr[0] != CALL_RIP_REL_OPCODE || - wr_instr[1] != CALL_RIP_REL_MODRM) { + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(wr_instr + 2); + disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -450,8 +448,7 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end, - struct module *mod) + struct alt_instr *end) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -480,7 +477,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -494,11 +490,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); - wr_instr = module_writable_address(mod, instr); - replacement = (u8 *)&a->repl_offset + a->repl_offset; - wr_replacement = module_writable_address(mod, replacement); - BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -509,9 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, wr_instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(wr_instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -521,12 +513,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, wr_replacement, a->replacementlen); + memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a, - mod); + insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } @@ -536,11 +527,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(wr_instr, insn_buff, insn_buff_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -731,20 +722,18 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -752,6 +741,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, op2 = insn.opcode.bytes[1]; switch (op1) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + /* See cfi_paranoid. */ + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: break; @@ -772,9 +766,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } @@ -810,8 +804,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; @@ -820,13 +813,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -846,41 +838,59 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } -#else -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } -#endif /* CONFIG_MITIGATION_RETHUNK */ +#else /* !CONFIG_MITIGATION_RETHUNK: */ +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* !CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr); +__noendbr bool is_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return __is_endbr(endbr); + +Efault: + return false; +} -static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) +#ifdef CONFIG_FINEIBT + +static __noendbr bool exact_endbr(u32 *val) { - u32 endbr, poison = gen_endbr_poison(); + u32 endbr; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) - return; + __get_kernel_nofault(&endbr, val, u32, Efault); + return endbr == gen_endbr(); + +Efault: + return false; +} - if (!is_endbr(endbr)) { - WARN_ON_ONCE(warn); +#endif + +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr) +{ + u32 poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(!is_endbr(addr))) return; - } DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); @@ -889,7 +899,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(wr_addr, &poison, 4); + text_poke_early(addr, &poison, 4); } /* @@ -898,36 +908,39 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, wr_addr, true); + poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16, wr_addr - 16); + poison_cfi(addr - 16); } } -#else +#else /* !CONFIG_X86_KERNEL_IBT: */ -void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif /* !CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_CFI_AUTO_DEFAULT -#define __CFI_DEFAULT CFI_AUTO +# define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI_CLANG) -#define __CFI_DEFAULT CFI_KCFI +# define __CFI_DEFAULT CFI_KCFI #else -#define __CFI_DEFAULT CFI_OFF +# define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; +#ifdef CONFIG_FINEIBT_BHI +bool cfi_bhi __ro_after_init = false; +#endif + #ifdef CONFIG_CFI_CLANG struct bpf_insn; @@ -935,11 +948,7 @@ struct bpf_insn; extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn); -/* - * Force a reference to the external symbol so the compiler generates - * __kcfi_typid. - */ -__ADDRESSABLE(__bpf_prog_runX); +KCFI_REFERENCE(__bpf_prog_runX); /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ asm ( @@ -956,7 +965,7 @@ asm ( /* Must match bpf_callback_t */ extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); -__ADDRESSABLE(__bpf_callback_fn); +KCFI_REFERENCE(__bpf_callback_fn); /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ asm ( @@ -991,6 +1000,21 @@ u32 cfi_get_func_hash(void *func) return hash; } + +int cfi_get_func_arity(void *func) +{ + bhi_thunk *target; + s32 disp; + + if (cfi_mode != CFI_FINEIBT && !cfi_bhi) + return 0; + + if (get_kernel_nofault(disp, func - 4)) + return 0; + + target = func + disp; + return target - __bhi_args; +} #endif #ifdef CONFIG_FINEIBT @@ -998,6 +1022,8 @@ u32 cfi_get_func_hash(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; +static bool cfi_paranoid __ro_after_init = false; + /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. @@ -1005,7 +1031,7 @@ static u32 cfi_seed __ro_after_init; static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; - while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) @@ -1037,6 +1063,25 @@ static __init int cfi_parse_cmdline(char *str) cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; + } else if (!strcmp(str, "warn")) { + pr_alert("CFI mismatch non-fatal!\n"); + cfi_warn = true; + } else if (!strcmp(str, "paranoid")) { + if (cfi_mode == CFI_FINEIBT) { + cfi_paranoid = true; + } else { + pr_err("Ignoring paranoid; depends on fineibt.\n"); + } + } else if (!strcmp(str, "bhi")) { +#ifdef CONFIG_FINEIBT_BHI + if (cfi_mode == CFI_FINEIBT) { + cfi_bhi = true; + } else { + pr_err("Ignoring bhi; depends on fineibt.\n"); + } +#else + pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); +#endif } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -1054,9 +1099,9 @@ early_param("cfi", cfi_parse_cmdline); * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 - * nop jz 1f // 2 - * nop ud2 // 2 - * nop 1: nop // 1 + * nop jne __cfi_\func+6 // 2 + * nop nop3 // 3 + * nop * nop * nop * nop @@ -1068,34 +1113,53 @@ early_param("cfi", cfi_parse_cmdline); * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 - * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 - * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 * */ -asm( ".pushsection .rodata \n" - "fineibt_preamble_start: \n" - " endbr64 \n" - " subl $0x12345678, %r10d \n" - " je fineibt_preamble_end \n" - " ud2 \n" - " nop \n" - "fineibt_preamble_end: \n" +/* + * : + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 78 56 34 12 sub $0x12345678, %r10d + * b: 75 f9 jne 6 + * d: 0f 1f 00 nopl (%rax) + * + * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as + * (bad) on x86_64 and raises #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + "fineibt_preamble_bhi: \n" + " jne fineibt_preamble_start+6 \n" + ASM_NOP3 + "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_bhi[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) +#define fineibt_preamble_ud 6 #define fineibt_preamble_hash 7 +/* + * : + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * a: 0f 1f 40 00 nopl 0x0(%rax) + */ asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" - " sub $16, %r11 \n" + " lea -0x10(%r11), %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" @@ -1109,13 +1173,62 @@ extern u8 fineibt_caller_end[]; #define fineibt_caller_jmp (fineibt_caller_size - 2) -static u32 decode_preamble_hash(void *addr) +/* + * Since FineIBT does hash validation on the callee side it is prone to + * circumvention attacks where a 'naked' ENDBR instruction exists that + * is not part of the fineibt_preamble sequence. + * + * Notably the x86 entry points must be ENDBR and equally cannot be + * fineibt_preamble. + * + * The fineibt_paranoid caller sequence adds additional caller side + * hash validation. This stops such circumvention attacks dead, but at the cost + * of adding a load. + * + * : + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b lea -0x10(%r11), %r11 + * e: 75 fd jne d + * 10: 41 ff d3 call *%r11 + * 13: 90 nop + * + * Notably LEA does not modify flags and can be reordered with the CMP, + * avoiding a dependency. Again, using a non-taken (backwards) branch + * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the + * Jcc.d8, causing #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_paranoid_start: \n" + " movl $0x12345678, %r10d \n" + " cmpl -9(%r11), %r10d \n" + " lea -0x10(%r11), %r11 \n" + " jne fineibt_paranoid_start+0xd \n" + "fineibt_paranoid_ind: \n" + " call *%r11 \n" + " nop \n" + "fineibt_paranoid_end: \n" + ".popsection \n" +); + +extern u8 fineibt_paranoid_start[]; +extern u8 fineibt_paranoid_ind[]; +extern u8 fineibt_paranoid_end[]; + +#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) +#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) +#define fineibt_paranoid_ud 0xd + +static u32 decode_preamble_hash(void *addr, int *reg) { u8 *p = addr; - /* b8 78 56 34 12 mov $0x12345678,%eax */ - if (p[0] == 0xb8) + /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ + if (p[0] >= 0xb8 && p[0] < 0xc0) { + if (reg) + *reg = p[0] - 0xb8; return *(u32 *)(addr + 1); + } return 0; /* invalid hash value */ } @@ -1124,11 +1237,11 @@ static u32 decode_caller_hash(void *addr) { u8 *p = addr; - /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); - /* e8 0c 78 56 34 12 jmp.d8 +12 */ + /* e8 0c 88 a9 cb ed jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); @@ -1136,7 +1249,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1148,23 +1261,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, jmp, 2); + text_poke_early(addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. @@ -1174,126 +1284,212 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, mov, 2); + text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr, NULL); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(wr_addr + 1, &hash, 4); + text_poke_early(addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) +static void cfi_fineibt_bhi_preamble(void *addr, int arity) +{ + if (!arity) + return; + + if (!cfi_warn && arity == 1) { + /* + * Crazy scheme to allow arity-1 inline: + * + * __cfi_foo: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 78 56 34 12 sub 0x12345678, %r10d + * b: 49 0f 45 fa cmovne %r10, %rdi + * f: 75 f5 jne __cfi_foo+6 + * 11: 0f 1f 00 nopl (%rax) + * + * Code that direct calls to foo()+0, decodes the tail end as: + * + * foo: + * 0: f5 cmc + * 1: 0f 1f 00 nopl (%rax) + * + * which clobbers CF, but does not affect anything ABI + * wise. + * + * Notably, this scheme is incompatible with permissive CFI + * because the CMOVcc is unconditional and RDI will have been + * clobbered. + */ + const u8 magic[9] = { + 0x49, 0x0f, 0x45, 0xfa, + 0x75, 0xf5, + BYTES_NOP3, + }; + + text_poke_early(addr + fineibt_preamble_bhi, magic, 9); + + return; + } + + text_poke_early(addr + fineibt_preamble_bhi, + text_gen_insn(CALL_INSN_OPCODE, + addr + fineibt_preamble_bhi, + __bhi_args[arity]), + CALL_INSN_SIZE); +} + +static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); + int arity; u32 hash; - hash = decode_preamble_hash(wr_addr); + /* + * When the function doesn't start with ENDBR the compiler will + * have determined there are no indirect calls to it and we + * don't need no CFI either. + */ + if (!is_endbr(addr + 16)) + continue; + + hash = decode_preamble_hash(addr, &arity); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + + WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, + "kCFI preamble has wrong register at: %pS %*ph\n", + addr, 5, addr); + + if (cfi_bhi) + cfi_fineibt_bhi_preamble(addr, arity); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) +static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr + 16, wr_addr + 16, false); + if (!exact_endbr(addr + 16)) + continue; + + poison_endbr(addr + 16); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(wr_addr + 2, &hash, 4); + text_poke_early(addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; + BUG_ON(fineibt_paranoid_size != 20); + for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; + struct insn insn; + u8 bytes[20]; u32 hash; + int ret; + u8 op; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - if (hash) { - text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); + hash = decode_caller_hash(addr); + if (!hash) + continue; + + if (!cfi_paranoid) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + /* rely on apply_retpolines() */ + continue; + } + + /* cfi_paranoid */ + ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { + WARN_ON_ONCE(1); + continue; } - /* rely on apply_retpolines() */ + + memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); + memcpy(bytes + fineibt_caller_hash, &hash, 4); + + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + + text_poke_early(addr, bytes, fineibt_paranoid_size); } return 0; } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { - bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1302,8 +1498,15 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; - if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { + /* + * FRED has much saner context on exception entry and + * is less easy to take advantage of. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + cfi_paranoid = true; cfi_mode = CFI_FINEIBT; + } } /* @@ -1311,7 +1514,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1322,11 +1525,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi, mod); + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } @@ -1338,7 +1541,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1348,20 +1551,23 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); + ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi, mod); + cfi_rewrite_endbr(start_cfi, end_cfi); - if (builtin) - pr_info("Using FineIBT CFI\n"); + if (builtin) { + pr_info("Using %sFineIBT%s CFI\n", + cfi_paranoid ? "paranoid " : "", + cfi_bhi ? "+BHI" : ""); + } return; default: @@ -1377,10 +1583,24 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr, void *wr_addr) +static void poison_cfi(void *addr) { + /* + * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, + * some (static) functions for which they can determine the address + * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. + * + * As such, these functions will get sealed, but we need to be careful + * to not unconditionally scribble the previous function. + */ switch (cfi_mode) { case CFI_FINEIBT: + /* + * FineIBT prefix should start with an ENDBR. + */ + if (!is_endbr(addr)) + break; + /* * __cfi_\func: * osp nopl (%rax) @@ -1389,17 +1609,23 @@ static void poison_cfi(void *addr, void *wr_addr) * ud2 * 1: nop */ - poison_endbr(addr, wr_addr, false); - poison_hash(wr_addr + fineibt_preamble_hash); + poison_endbr(addr); + poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: + /* + * kCFI prefix should start with a valid hash. + */ + if (!decode_preamble_hash(addr, NULL)) + break; + /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(wr_addr + 1); + poison_hash(addr + 1); break; default: @@ -1407,24 +1633,135 @@ static void poison_cfi(void *addr, void *wr_addr) } } -#else +/* + * When regs->ip points to a 0xEA byte in the FineIBT preamble, + * return true and fill out target and type. + * + * We check the preamble by checking for the ENDBR instruction relative to the + * 0xEA instruction. + */ +static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_preamble_ud; + u32 hash; + + if (!exact_endbr((void *)addr)) + return false; + + *target = addr + fineibt_preamble_size; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * Since regs->ip points to the middle of an instruction; it cannot + * continue with the normal fixup. + */ + regs->ip = *target; + + return true; + +Efault: + return false; +} + +/* + * regs->ip points to one of the UD2 in __bhi_args[]. + */ +static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr; + u32 hash; + + if (!cfi_bhi) + return false; + + if (regs->ip < (unsigned long)__bhi_args || + regs->ip >= (unsigned long)__bhi_args_end) + return false; + + /* + * Fetch the return address from the stack, this points to the + * FineIBT preamble. Since the CALL instruction is in the 5 last + * bytes of the preamble, the return address is in fact the target + * address. + */ + __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); + *target = addr; + + addr -= fineibt_preamble_size; + if (!exact_endbr((void *)addr)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * The UD2 sites are constructed with a RET immediately following, + * as such the non-fatal case can use the regular fixup. + */ + return true; + +Efault: + return false; +} + +/* + * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] + * sequence. + */ +static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_paranoid_ud; + u32 hash; + + if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + +Efault: + return false; +} + +bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + if (decode_fineibt_paranoid(regs, target, type)) + return true; + + if (decode_fineibt_bhi(regs, target, type)) + return true; + + return decode_fineibt_preamble(regs, target, type); +} + +#else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr) { } +static void poison_cfi(void *addr) { } #endif -#endif +#endif /* !CONFIG_FINEIBT */ void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, mod); + start_cfi, end_cfi, + /* .builtin = */ false); } #ifdef CONFIG_SMP @@ -1721,27 +2058,27 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, NULL); + __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); - apply_returns(__return_sites, __return_sites_end, NULL); - - apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); /* - * Now all calls are established. Apply the call thunks if - * required. + * Adjust all CALL instructions to point to func()-10, including + * those in .altinstr_replacement. */ callthunks_patch_builtin_calls(); + apply_alternatives(__alt_instructions, __alt_instructions_end); + /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 11fac09e3a8cb..6d12a9b694327 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -73,7 +73,6 @@ static int amd_cache_northbridges(void) amd_northbridges.nb = nb; for (i = 0; i < amd_northbridges.num; i++) { - node_to_amd_nb(i)->root = amd_node_get_root(i); node_to_amd_nb(i)->misc = amd_node_get_func(i, 3); /* @@ -143,7 +142,6 @@ bool __init early_is_amd_nb(u32 device) struct resource *amd_get_mmconfig_range(struct resource *res) { - u32 address; u64 base, msr; unsigned int segn_busn_bits; @@ -151,13 +149,11 @@ struct resource *amd_get_mmconfig_range(struct resource *res) boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return NULL; - /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */ + if (boot_cpu_data.x86 < 0x10 || + rdmsrl_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) return NULL; - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - /* mmconfig is not enabled */ if (!(msr & FAM10H_MMIO_CONF_ENABLE)) return NULL; diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index d2ec7fd555c51..b670fa85c61b4 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -8,6 +8,7 @@ * Author: Yazen Ghannam */ +#include #include /* @@ -93,10 +94,14 @@ static struct pci_dev **amd_roots; /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); +static bool smn_exclusive; #define SMN_INDEX_OFFSET 0x60 #define SMN_DATA_OFFSET 0x64 +#define HSMP_INDEX_OFFSET 0xc4 +#define HSMP_DATA_OFFSET 0xc8 + /* * SMN accesses may fail in ways that are difficult to detect here in the called * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do @@ -146,6 +151,9 @@ static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, b if (!root) return err; + if (!smn_exclusive) + return err; + guard(mutex)(&smn_mutex); err = pci_write_config_dword(root, i_off, address); @@ -179,6 +187,93 @@ int __must_check amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); +int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write) +{ + return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write); +} +EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr); + +static struct dentry *debugfs_dir; +static u16 debug_node; +static u32 debug_address; + +static ssize_t smn_node_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + u16 node; + int ret; + + ret = kstrtou16_from_user(userbuf, count, 0, &node); + if (ret) + return ret; + + if (node >= amd_num_nodes()) + return -ENODEV; + + debug_node = node; + return count; +} + +static int smn_node_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%08x\n", debug_node); + return 0; +} + +static ssize_t smn_address_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + int ret; + + ret = kstrtouint_from_user(userbuf, count, 0, &debug_address); + if (ret) + return ret; + + return count; +} + +static int smn_address_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%08x\n", debug_address); + return 0; +} + +static int smn_value_show(struct seq_file *m, void *v) +{ + u32 val; + int ret; + + ret = amd_smn_read(debug_node, debug_address, &val); + if (ret) + return ret; + + seq_printf(m, "0x%08x\n", val); + return 0; +} + +static ssize_t smn_value_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + u32 val; + int ret; + + ret = kstrtouint_from_user(userbuf, count, 0, &val); + if (ret) + return ret; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + ret = amd_smn_write(debug_node, debug_address, val); + if (ret) + return ret; + + return count; +} + +DEFINE_SHOW_STORE_ATTRIBUTE(smn_node); +DEFINE_SHOW_STORE_ATTRIBUTE(smn_address); +DEFINE_SHOW_STORE_ATTRIBUTE(smn_value); + static int amd_cache_roots(void) { u16 node, num_nodes = amd_num_nodes(); @@ -193,6 +288,48 @@ static int amd_cache_roots(void) return 0; } +static int reserve_root_config_spaces(void) +{ + struct pci_dev *root = NULL; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus))) { + /* Root device is Device 0 Function 0 on each Primary Bus. */ + root = pci_get_slot(bus, 0); + if (!root) + continue; + + if (root->vendor != PCI_VENDOR_ID_AMD && + root->vendor != PCI_VENDOR_ID_HYGON) + continue; + + pci_dbg(root, "Reserving PCI config space\n"); + + /* + * There are a few SMN index/data pairs and other registers + * that shouldn't be accessed by user space. + * So reserve the entire PCI config space for simplicity rather + * than covering specific registers piecemeal. + */ + if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { + pci_err(root, "Failed to reserve config space\n"); + return -EEXIST; + } + } + + smn_exclusive = true; + return 0; +} + +static bool enable_dfs; + +static int __init amd_smn_enable_dfs(char *str) +{ + enable_dfs = true; + return 1; +} +__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs); + static int __init amd_smn_init(void) { int err; @@ -209,6 +346,18 @@ static int __init amd_smn_init(void) if (err) return err; + err = reserve_root_config_spaces(); + if (err) + return err; + + if (enable_dfs) { + debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir); + + debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops); + debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops); + debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops); + } + return 0; } diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3bf0487cf3b73..52d1808ee360b 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o endif -# APIC probe will depend on the listing order here -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o - # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e893dc6f11c13..ddca8da6d4680 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1371,8 +1371,6 @@ void __init apic_intr_mode_init(void) x86_64_probe_apic(); - x86_32_install_bigsmp(); - if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1674,7 +1672,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } topology_register_boot_apic(boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c deleted file mode 100644 index 9285d500d5b44..0000000000000 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. - * - * Drives the local APIC in "clustered mode". - */ -#include -#include -#include - -#include -#include - -#include "local.h" - -static u32 bigsmp_get_apic_id(u32 x) -{ - return (x >> 24) & 0xFF; -} - -static void bigsmp_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void bigsmp_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - -static int dmi_bigsmp; /* can be set by dmi scanners */ - -static int hp_ht_bigsmp(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); - dmi_bigsmp = 1; - - return 0; -} - - -static const struct dmi_system_id bigsmp_dmi_table[] = { - { hp_ht_bigsmp, "HP ProLiant DL760 G2", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P44-"), - } - }, - - { hp_ht_bigsmp, "HP ProLiant DL740", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P47-"), - } - }, - { } /* NULL entry stops DMI scanning */ -}; - -static int probe_bigsmp(void) -{ - return dmi_check_system(bigsmp_dmi_table); -} - -static struct apic apic_bigsmp __ro_after_init = { - - .name = "bigsmp", - .probe = probe_bigsmp, - - .dest_mode_logical = false, - - .disable_esr = 1, - - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = bigsmp_get_apic_id, - - .calc_dest_apicid = apic_default_calc_apicid, - - .send_IPI = default_send_IPI_single_phys, - .send_IPI_mask = default_send_IPI_mask_sequence_phys, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = bigsmp_send_IPI_allbutself, - .send_IPI_all = bigsmp_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -bool __init apic_bigsmp_possible(bool cmdline_override) -{ - return apic == &apic_bigsmp || !cmdline_override; -} - -void __init apic_bigsmp_force(void) -{ - if (apic != &apic_bigsmp) - apic_install_driver(&apic_bigsmp); -} - -apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 5da693d633b78..98a57cb4aa861 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -23,7 +24,7 @@ __setup("no_ipi_broadcast=", apic_ipi_shorthand); static int __init print_ipi_mode(void) { pr_info("IPI shorthand broadcast: %s\n", - apic_ipi_shorthand_off ? "disabled" : "enabled"); + str_disabled_enabled(apic_ipi_shorthand_off)); return 0; } late_initcall(print_ipi_mode); @@ -287,34 +288,4 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } - -#ifdef CONFIG_SMP -static int convert_apicid_to_cpu(u32 apic_id) -{ - int i; - - for_each_possible_cpu(i) { - if (per_cpu(x86_cpu_to_apicid, i) == apic_id) - return i; - } - return -1; -} - -int safe_smp_processor_id(void) -{ - u32 apicid; - int cpuid; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - - apicid = read_apic_id(); - if (apicid == BAD_APICID) - return 0; - - cpuid = convert_apicid_to_cpu(apicid); - - return cpuid >= 0 ? cpuid : 0; -} -#endif #endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 842fe28496bee..bdcf609eb2835 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -65,17 +65,4 @@ void default_send_IPI_self(int vector); void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); -void x86_32_probe_bigsmp_early(void); -void x86_32_install_bigsmp(void); -#else -static inline void x86_32_probe_bigsmp_early(void) { } -static inline void x86_32_install_bigsmp(void) { } -#endif - -#ifdef CONFIG_X86_BIGSMP -bool apic_bigsmp_possible(bool cmdline_selected); -void apic_bigsmp_force(void); -#else -static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; -static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f75ee345c02d0..87bc9e7ca5d60 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -93,35 +93,6 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init x86_32_probe_bigsmp_early(void) -{ - if (nr_cpu_ids <= 8 || xen_pv_domain()) - return; - - if (IS_ENABLED(CONFIG_X86_BIGSMP)) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(boot_cpu_apic_version)) - break; - /* P4 and above */ - fallthrough; - case X86_VENDOR_HYGON: - case X86_VENDOR_AMD: - if (apic_bigsmp_possible(cmdline_apic)) - return; - break; - } - } - pr_info("Limiting to 8 possible CPUs\n"); - set_nr_cpu_ids(8); -} - -void __init x86_32_install_bigsmp(void) -{ - if (nr_cpu_ids > 8 && !xen_pv_domain()) - apic_bigsmp_force(); -} - void __init x86_32_probe_apic(void) { if (!cmdline_apic) { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 736f62812f5c2..72fa4bb78f0a6 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd, return err ? err : IRQ_SET_MASK_OK; } +static void free_moved_vector(struct apic_chip_data *apicd) +{ + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; + + /* + * Managed interrupts are usually not migrated away + * from an online CPU, but CPU isolation 'managed_irq' + * can make that happen. + * 1) Activation does not take the isolation into account + * to keep the code simple + * 2) Migration away from an isolated CPU can happen when + * a non-isolated CPU which is in the calculated + * affinity mask comes online. + */ + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; +} + +/* + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. + */ +static void apic_force_complete_move(struct irq_data *irqd) +{ + unsigned int cpu = smp_processor_id(); + struct apic_chip_data *apicd; + unsigned int vector; + + guard(raw_spinlock)(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + return; + + /* + * If prev_vector is empty or the descriptor is neither currently + * nor previously on the outgoing CPU no action required. + */ + vector = apicd->prev_vector; + if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) + return; + + /* + * This is tricky. If the cleanup of the old vector has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. + * + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (apicd->move_in_progress) { + /* + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendezvous in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theoretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. + */ + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqd->irq, vector); + } + free_moved_vector(apicd); +} + #else -# define apic_set_affinity NULL +# define apic_set_affinity NULL +# define apic_force_complete_move NULL #endif static int apic_retrigger_irq(struct irq_data *irqd) @@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data, } static struct irq_chip lapic_controller = { - .name = "APIC", - .irq_ack = apic_ack_edge, - .irq_set_affinity = apic_set_affinity, - .irq_compose_msi_msg = x86_vector_msi_compose_msg, - .irq_retrigger = apic_retrigger_irq, + .name = "APIC", + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, + .irq_force_complete_move = apic_force_complete_move, + .irq_retrigger = apic_retrigger_irq, }; #ifdef CONFIG_SMP -static void free_moved_vector(struct apic_chip_data *apicd) -{ - unsigned int vector = apicd->prev_vector; - unsigned int cpu = apicd->prev_cpu; - bool managed = apicd->is_managed; - - /* - * Managed interrupts are usually not migrated away - * from an online CPU, but CPU isolation 'managed_irq' - * can make that happen. - * 1) Activation does not take the isolation into account - * to keep the code simple - * 2) Migration away from an isolated CPU can happen when - * a non-isolated CPU which is in the calculated - * affinity mask comes online. - */ - trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - hlist_del_init(&apicd->clist); - apicd->prev_vector = 0; - apicd->move_in_progress = 0; -} - static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) { struct apic_chip_data *apicd; @@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg) __vector_schedule_cleanup(apicd); } -/* - * Called from fixup_irqs() with @desc->lock held and interrupts disabled. - */ -void irq_force_complete_move(struct irq_desc *desc) -{ - unsigned int cpu = smp_processor_id(); - struct apic_chip_data *apicd; - struct irq_data *irqd; - unsigned int vector; - - /* - * The function is called for all descriptors regardless of which - * irqdomain they belong to. For example if an IRQ is provided by - * an irq_chip as part of a GPIO driver, the chip data for that - * descriptor is specific to the irq_chip in question. - * - * Check first that the chip_data is what we expect - * (apic_chip_data) before touching it any further. - */ - irqd = irq_domain_get_irq_data(x86_vector_domain, - irq_desc_get_irq(desc)); - if (!irqd) - return; - - raw_spin_lock(&vector_lock); - apicd = apic_chip_data(irqd); - if (!apicd) - goto unlock; - - /* - * If prev_vector is empty or the descriptor is neither currently - * nor previously on the outgoing CPU no action required. - */ - vector = apicd->prev_vector; - if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) - goto unlock; - - /* - * This is tricky. If the cleanup of the old vector has not been - * done yet, then the following setaffinity call will fail with - * -EBUSY. This can leave the interrupt in a stale state. - * - * All CPUs are stuck in stop machine with interrupts disabled so - * calling __irq_complete_move() would be completely pointless. - * - * 1) The interrupt is in move_in_progress state. That means that we - * have not seen an interrupt since the io_apic was reprogrammed to - * the new vector. - * - * 2) The interrupt has fired on the new vector, but the cleanup IPIs - * have not been processed yet. - */ - if (apicd->move_in_progress) { - /* - * In theory there is a race: - * - * set_ioapic(new_vector) <-- Interrupt is raised before update - * is effective, i.e. it's raised on - * the old vector. - * - * So if the target cpu cannot handle that interrupt before - * the old vector is cleaned up, we get a spurious interrupt - * and in the worst case the ioapic irq line becomes stale. - * - * But in case of cpu hotplug this should be a non issue - * because if the affinity update happens right before all - * cpus rendezvous in stop machine, there is no way that the - * interrupt can be blocked on the target cpu because all cpus - * loops first with interrupts enabled in stop machine, so the - * old vector is not yet cleaned up when the interrupt fires. - * - * So the only way to run into this issue is if the delivery - * of the interrupt on the apic/system bus would be delayed - * beyond the point where the target cpu disables interrupts - * in stop machine. I doubt that it can happen, but at least - * there is a theoretical chance. Virtualization might be - * able to expose this, but AFAICT the IOAPIC emulation is not - * as stupid as the real hardware. - * - * Anyway, there is nothing we can do about that at this point - * w/o refactoring the whole fixup_irq() business completely. - * We print at least the irq number and the old vector number, - * so we have the necessary information when a problem in that - * area arises. - */ - pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqd->irq, vector); - } - free_moved_vector(apicd); -unlock: - raw_spin_unlock(&vector_lock); -} - #ifdef CONFIG_HOTPLUG_CPU /* * Note, this is not accurate accounting, but at least good enough to diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a98020bf31bb4..ad4ea6fb3b6cb 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -107,11 +107,6 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); - OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); - OFFSET(X86_current_task, pcpu_hot, current_task); -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING - OFFSET(X86_call_depth, pcpu_hot, call_depth); -#endif #if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) /* Offset for fields in aria_ctx */ BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bb65371ea9df5..590b6cd0eac0e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -54,11 +54,5 @@ int main(void) BLANK(); #undef ENTRY - BLANK(); - -#ifdef CONFIG_STACKPROTECTOR - OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); - BLANK(); -#endif return 0; } diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 3fed7ae58b603..73274d76ce167 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -20,27 +21,13 @@ int sbf_port __initdata = -1; /* set via acpi_boot_init() */ -static int __init parity(u8 v) -{ - int x = 0; - int i; - - for (i = 0; i < 8; i++) { - x ^= (v & 1); - v >>= 1; - } - - return x; -} - static void __init sbf_write(u8 v) { unsigned long flags; if (sbf_port != -1) { - v &= ~SBF_PARITY; - if (!parity(v)) - v |= SBF_PARITY; + if (!parity8(v)) + v ^= SBF_PARITY; printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); @@ -66,14 +53,14 @@ static u8 __init sbf_read(void) return v; } -static int __init sbf_value_valid(u8 v) +static bool __init sbf_value_valid(u8 v) { if (v & SBF_RESERVED) /* Reserved bits */ - return 0; - if (!parity(v)) - return 0; + return false; + if (!parity8(v)) + return false; - return 1; + return true; } static int __init sbf_init(void) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 8418a892d195a..25ae542501124 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -239,22 +239,11 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) patch_call((void *)s + *s, ct); } -static __init_or_module void -patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, - const struct core_text *ct) -{ - struct alt_instr *a; - - for (a = start; a < end; a++) - patch_call((void *)&a->instr_offset + a->instr_offset, ct); -} - static __init_or_module void callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); - patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } @@ -263,8 +252,6 @@ void __init callthunks_patch_builtin_calls(void) struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, - .alt_start = __alt_instructions, - .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c index e6bf78fac1462..77086cf565ec1 100644 --- a/arch/x86/kernel/cfi.c +++ b/arch/x86/kernel/cfi.c @@ -67,16 +67,30 @@ static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, */ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { - unsigned long target; + unsigned long target, addr = regs->ip; u32 type; - if (!is_cfi_trap(regs->ip)) - return BUG_TRAP_TYPE_NONE; + switch (cfi_mode) { + case CFI_KCFI: + if (!is_cfi_trap(addr)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, addr); + + break; - if (!decode_cfi_insn(regs, &target, &type)) - return report_cfi_failure_noaddr(regs, regs->ip); + case CFI_FINEIBT: + if (!decode_fineibt_insn(regs, &target, &type)) + return BUG_TRAP_TYPE_NONE; + + break; + + default: + return BUG_TRAP_TYPE_NONE; + } - return report_cfi_failure(regs, regs->ip, &target, type); + return report_cfi_failure(regs, addr, &target, type); } /* diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 54194f5995de3..79569f72b8ee5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -29,6 +29,8 @@ #include "cpu.h" +u16 invlpgb_count_max __ro_after_init; + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -632,7 +634,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) * (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); if (!rdmsrl_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); @@ -1073,6 +1075,10 @@ static void init_amd(struct cpuinfo_x86 *c) /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + + /* Enable Translation Cache Extension */ + if (cpu_has(c, X86_FEATURE_TCE)) + msr_set_bit(MSR_EFER, _EFER_TCE); } #ifdef CONFIG_X86_32 @@ -1105,8 +1111,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB @@ -1119,26 +1125,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { /* Erratum 658 */ if (c->x86 == 0x15 && c->x86_model <= 0x1f) { - tlb_lli_2m[ENTRIES] = 1024; + tlb_lli_2m = 1024; } else { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; + + tlb_lli_4m = tlb_lli_2m >> 1; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + /* Max number of pages INVLPGB can invalidate in one shot */ + if (cpu_has(c, X86_FEATURE_INVLPGB)) + invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; } static const struct cpu_dev amd_cpu_dev = { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a5d0998d76049..4386aa6c69e12 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -113,6 +113,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +/* Control IBPB on vCPU load */ +DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); +EXPORT_SYMBOL_GPL(switch_vcpu_ibpb); + /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -234,7 +238,7 @@ static void x86_amd_ssb_disable(void) /* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -243,6 +247,40 @@ static const char * const mds_strings[] = { [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_AUTO, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF; + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_AUTO, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF; + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_AUTO, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -250,6 +288,9 @@ static void __init mds_select_mitigation(void) return; } + if (mds_mitigation == MDS_MITIGATION_AUTO) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; @@ -286,16 +327,6 @@ early_param("mds", mds_cmdline); #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt -enum taa_mitigations { - TAA_MITIGATION_OFF, - TAA_MITIGATION_UCODE_NEEDED, - TAA_MITIGATION_VERW, - TAA_MITIGATION_TSX_DISABLED, -}; - -/* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -386,15 +417,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "MMIO Stale Data: " fmt -enum mmio_mitigations { - MMIO_MITIGATION_OFF, - MMIO_MITIGATION_UCODE_NEEDED, - MMIO_MITIGATION_VERW, -}; - -/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -483,16 +505,6 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Register File Data Sampling: " fmt -enum rfds_mitigations { - RFDS_MITIGATION_OFF, - RFDS_MITIGATION_VERW, - RFDS_MITIGATION_UCODE_NEEDED, -}; - -/* Default mitigation for Register File Data Sampling */ -static enum rfds_mitigations rfds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; - static const char * const rfds_strings[] = { [RFDS_MITIGATION_OFF] = "Vulnerable", [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", @@ -508,6 +520,9 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else @@ -1293,9 +1308,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { + enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; + mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? + SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; @@ -1308,7 +1327,7 @@ spectre_v2_parse_user_cmdline(void) ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + return mode; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1318,8 +1337,8 @@ spectre_v2_parse_user_cmdline(void) } } - pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + pr_err("Unknown user space protection option (%s). Switching to default\n", arg); + return mode; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) @@ -1331,16 +1350,11 @@ static void __init spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - bool smt_possible = IS_ENABLED(CONFIG_SMP); enum spectre_v2_user_cmd cmd; if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) - smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: @@ -1364,7 +1378,7 @@ spectre_v2_user_select_mitigation(void) /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + static_branch_enable(&switch_vcpu_ibpb); spectre_v2_user_ibpb = mode; switch (cmd) { @@ -1401,7 +1415,7 @@ spectre_v2_user_select_mitigation(void) * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || - !smt_possible || + !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; @@ -1973,6 +1987,7 @@ void cpu_bugs_smt_update(void) switch (mds_mitigation) { case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: case MDS_MITIGATION_VMWERV: if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) pr_warn_once(MDS_MSG_SMT); @@ -1984,6 +1999,7 @@ void cpu_bugs_smt_update(void) switch (taa_mitigation) { case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: case TAA_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(TAA_MSG_SMT); @@ -1995,6 +2011,7 @@ void cpu_bugs_smt_update(void) switch (mmio_mitigation) { case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: case MMIO_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(MMIO_MSG_SMT); @@ -2522,6 +2539,7 @@ enum srso_mitigation { SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, SRSO_MITIGATION_IBPB_ON_VMEXIT, + SRSO_MITIGATION_BP_SPEC_REDUCE, }; enum srso_mitigation_cmd { @@ -2539,7 +2557,8 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only", + [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2578,7 +2597,7 @@ static void __init srso_select_mitigation(void) srso_cmd == SRSO_CMD_OFF) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; - return; + goto out; } if (has_microcode) { @@ -2590,7 +2609,7 @@ static void __init srso_select_mitigation(void) */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - return; + goto out; } if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { @@ -2670,6 +2689,12 @@ static void __init srso_select_mitigation(void) ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: + if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { + pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); + srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; + break; + } + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); @@ -2691,7 +2716,15 @@ static void __init srso_select_mitigation(void) } out: - pr_info("%s\n", srso_strings[srso_mitigation]); + /* + * Clear the feature flag if this mitigation is not selected as that + * feature flag controls the BpSpecReduce MSR bit toggling in KVM. + */ + if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) + setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); + + if (srso_mitigation != SRSO_MITIGATION_NONE) + pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 6cba85c79d42d..97222efb4d2a6 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -192,7 +192,13 @@ static void __split_lock_reenable(struct work_struct *work) { sld_update_msr(true); } -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); +/* + * In order for each CPU to schedule its delayed work independently of the + * others, delayed work struct must be per-CPU. This is not required when + * sysctl_sld_mitigate is enabled because of the semaphore that limits + * the number of simultaneously scheduled delayed works to 1. + */ +static DEFINE_PER_CPU(struct delayed_work, sl_reenable); /* * If a CPU goes offline with pending delayed work to re-enable split lock @@ -213,7 +219,7 @@ static int splitlock_cpu_offline(unsigned int cpu) static void split_lock_warn(unsigned long ip) { - struct delayed_work *work; + struct delayed_work *work = NULL; int cpu; if (!current->reported_split_lock) @@ -235,11 +241,17 @@ static void split_lock_warn(unsigned long ip) if (down_interruptible(&buslock_sem) == -EINTR) return; work = &sl_reenable_unlock; - } else { - work = &sl_reenable; } cpu = get_cpu(); + + if (!work) { + work = this_cpu_ptr(&sl_reenable); + /* Deferred initialization of per-CPU struct */ + if (!work->work.func) + INIT_DELAYED_WORK(work, __split_lock_reenable); + } + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index e6fa03ed9172c..b3a520959b510 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -8,21 +8,19 @@ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ -#include #include +#include #include #include -#include -#include -#include #include #include +#include -#include -#include #include -#include +#include +#include #include +#include #include #include "cpu.h" @@ -31,7 +29,6 @@ #define LVL_1_DATA 2 #define LVL_2 3 #define LVL_3 4 -#define LVL_TRACE 5 /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -96,10 +93,6 @@ static const struct _cache_table cache_table[] = { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -787,19 +780,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) } } } - /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache - */ - if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) { + + /* Don't use CPUID(2) if CPUID(4) is supported. */ + if (!ci->num_leaves && c->cpuid_level > 1) { /* supports eax=2 call */ int j, n; unsigned int regs[4]; unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (ci->num_leaves && c->x86 == 15) - only_trace = 1; /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; @@ -808,7 +795,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; @@ -820,8 +807,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) /* look up this descriptor in the table */ while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; switch (cache_table[k].cache_type) { case LVL_1_INST: l1i += cache_table[k].size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7cce91b19fb2c..ae873a742f890 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -667,8 +667,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } @@ -846,13 +846,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -u16 __read_mostly tlb_lli_4k[NR_INFO]; -u16 __read_mostly tlb_lli_2m[NR_INFO]; -u16 __read_mostly tlb_lli_4m[NR_INFO]; -u16 __read_mostly tlb_lld_4k[NR_INFO]; -u16 __read_mostly tlb_lld_2m[NR_INFO]; -u16 __read_mostly tlb_lld_4m[NR_INFO]; -u16 __read_mostly tlb_lld_1g[NR_INFO]; +u16 __read_mostly tlb_lli_4k; +u16 __read_mostly tlb_lli_2m; +u16 __read_mostly tlb_lli_4m; +u16 __read_mostly tlb_lld_4k; +u16 __read_mostly tlb_lld_2m; +u16 __read_mostly tlb_lld_4m; +u16 __read_mostly tlb_lld_1g; static void cpu_detect_tlb(struct cpuinfo_x86 *c) { @@ -860,12 +860,10 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) this_cpu->c_detect_tlb(c); pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", - tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES]); + tlb_lli_4k, tlb_lli_2m, tlb_lli_4m); pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", - tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], - tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); + tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g); } void get_cpu_vendor(struct cpuinfo_x86 *c) @@ -1164,7 +1162,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), @@ -1331,8 +1329,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) { setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && @@ -1479,15 +1479,96 @@ static void detect_nopl(void) #endif } +static inline bool parse_set_clear_cpuid(char *arg, bool set) +{ + char *opt; + int taint = 0; + + while (arg) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&arg, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. It doesn't make sense for a bug not to have a + * name so don't handle bug flags here. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + + if (set) { + pr_warn("setcpuid: force-enabling CPU feature flag:"); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU feature flag:"); + setup_clear_cpu_cap(bit); + } + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" %d:%d\n", bit >> 5, bit & 31); + else + pr_cont(" %s\n", x86_cap_flags[bit]); + + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + + for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { + const char *flag; + const char *kind; + + if (bit < 32 * NCAPINTS) { + flag = x86_cap_flags[bit]; + kind = "feature"; + } else { + kind = "bug"; + flag = x86_bug_flags[bit - (32 * NCAPINTS)]; + } + + if (!flag) + continue; + + if (strcmp(flag, opt)) + continue; + + if (set) { + pr_warn("setcpuid: force-enabling CPU %s flag: %s\n", + kind, flag); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n", + kind, flag); + setup_clear_cpu_cap(bit); + } + taint++; + found = true; + break; + } + + if (!found) + pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt); + } + + return taint; +} + + /* * We parse cpu parameters early because fpu__init_system() is executed * before parse_early_param(). */ static void __init cpu_parse_early_param(void) { + bool cpuid_taint = false; char arg[128]; - char *argptr = arg, *opt; - int arglen, taint = 0; + int arglen; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1519,61 +1600,17 @@ static void __init cpu_parse_early_param(void) setup_clear_cpu_cap(X86_FEATURE_FRED); arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, false); - pr_info("Clearing CPUID bits:"); - - while (argptr) { - bool found __maybe_unused = false; - unsigned int bit; + arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, true); - opt = strsep(&argptr, ","); - - /* - * Handle naked numbers first for feature flags which don't - * have names. - */ - if (!kstrtouint(opt, 10, &bit)) { - if (bit < NCAPINTS * 32) { - - /* empty-string, i.e., ""-defined feature flags */ - if (!x86_cap_flags[bit]) - pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); - else - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - - setup_clear_cpu_cap(bit); - taint++; - } - /* - * The assumption is that there are no feature names with only - * numbers in the name thus go to the next argument. - */ - continue; - } - - for (bit = 0; bit < 32 * NCAPINTS; bit++) { - if (!x86_cap_flag(bit)) - continue; - - if (strcmp(x86_cap_flag(bit), opt)) - continue; - - pr_cont(" %s", opt); - setup_clear_cpu_cap(bit); - taint++; - found = true; - break; - } - - if (!found) - pr_cont(" (unknown: %s)", opt); - } - pr_cont("\n"); - - if (taint) + if (cpuid_taint) { + pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } /* @@ -1962,9 +1999,15 @@ static __init void identify_boot_cpu(void) lkgs_init(); } -void identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(unsigned int cpu) { - BUG_ON(c == &boot_cpu_data); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = cpu; + identify_cpu(c); #ifdef CONFIG_X86_32 enable_sep_cpu(); @@ -1975,6 +2018,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_gds_msr(); tsx_ap_init(); + c->initialized = true; } void print_cpu_info(struct cpuinfo_x86 *c) @@ -2005,27 +2049,40 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy - * function prevents it from becoming an environment variable for init. + * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param(). + * These dummy functions prevent them from becoming an environment variable for + * init. */ + static __init int setup_clearcpuid(char *arg) { return 1; } __setup("clearcpuid=", setup_clearcpuid); -DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { - .current_task = &init_task, - .preempt_count = INIT_PREEMPT_COUNT, - .top_of_stack = TOP_OF_INIT_STACK, -}; -EXPORT_PER_CPU_SYMBOL(pcpu_hot); -EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); +static __init int setup_setcpuid(char *arg) +{ + return 1; +} +__setup("setcpuid=", setup_setcpuid); + +DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +EXPORT_PER_CPU_SYMBOL(const_current_task); + +DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, - fixed_percpu_data) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); +/* + * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING + * so that this space is reserved in the hot cache section even when the + * mitigation is disabled. + */ +DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); +EXPORT_PER_CPU_SYMBOL(__x86_call_depth); static void wrmsrl_cstar(unsigned long val) { @@ -2089,18 +2146,15 @@ void syscall_init(void) if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_syscall_init(); } - -#else /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); #ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -#endif /* CONFIG_X86_64 */ - /* * Clear all 6 debug registers: */ diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1beccefbaff9a..51deb60a9d267 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,14 +33,6 @@ struct cpu_dev { #endif }; -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; - /* unsigned int ways; */ - char info[128]; -}; - #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __section(".x86_cpu_dev.init") = \ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 8bd84114c2d96..df838e3bdbe02 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -45,6 +45,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, + { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, { X86_FEATURE_VAES, X86_FEATURE_AVX }, { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 9651275aecd1b..dfec2c61e3547 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -153,8 +153,8 @@ static void geode_configure(void) u8 ccr3; local_irq_save(flags); - /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + /* Suspend on halt power saving */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index cacfd3f6abeff..1976fef2dfe51 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p) if (!c->initialized) return 0; - seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); - seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index c5191b06f9f21..6af4a4a90a521 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -240,26 +240,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev hygon_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3dce22f00dc34..291c82816797e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,40 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include #include -#include -#include -#include -#include #include -#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif -#include -#include #include +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include #include +#include #include - -#ifdef CONFIG_X86_64 -#include -#endif +#include #include "cpu.h" -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#endif - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -635,70 +626,90 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 -#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G_2M_4M 0x17 -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +/* + * All of leaf 0x2's one-byte TLB descriptors implies the same number of + * entries for their respective TLB types. The 0x63 descriptor is an + * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries + * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for + * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the + * intel_tlb_table[] mapping. + */ +#define TLB_0x63_2M_4M_ENTRIES 32 + +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; +}; static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, - { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, - { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, - { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, - { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, - { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, - { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, - { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, - { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, - { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, - { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, - { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, - { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, - { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, - { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ + { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ + { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ + { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ + { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ + { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ + { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ + { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ + { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ + { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ + { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */ { 0x00, 0, 0 } }; static void intel_tlb_lookup(const unsigned char desc) { + unsigned int entries; unsigned char k; + if (desc == 0) return; @@ -710,75 +721,58 @@ static void intel_tlb_lookup(const unsigned char desc) if (intel_tlb_table[k].tlb_type == 0) return; + entries = intel_tlb_table[k].entries; switch (intel_tlb_table[k].tlb_type) { case STLB_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case STLB_4K_2M: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_INST_ALL: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); break; case TLB_INST_4M: - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_2M_4M: - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_4K_4M: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; + case TLB_DATA_1G_2M_4M: + tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES); + tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES); + fallthrough; case TLB_DATA_1G: - if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_1g = max(tlb_lld_1g, entries); break; } } @@ -799,7 +793,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; @@ -873,34 +867,3 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); - -#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 - -/** - * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU - * - * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in - * a hybrid processor. If the processor is not hybrid, returns 0. - */ -u8 get_this_hybrid_cpu_type(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; -} - -/** - * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU - * - * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. - * If the processor is not hybrid, returns 0. - */ -u32 get_this_hybrid_cpu_native_id(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) & - (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); -} diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0dc00c9894c7c..1f14c3308b6b4 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -584,6 +584,28 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +static bool mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return true; + } + + return false; +} + static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -1773,28 +1795,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -bool mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return true; - } - return false; -} -EXPORT_SYMBOL_GPL(mce_notify_irq); - static void __mcheck_cpu_mce_banks_init(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 313fe682db339..06e3cf7229ce9 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -229,7 +229,6 @@ static int raise_local(void) } else if (m->status) { pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); - mce_notify_irq(); pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a5dac7f3c0a07..138689b8e1d83 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -23,14 +23,18 @@ #include #include +#include #include #include #include #include #include +#include + #include #include +#include #include #include #include @@ -145,6 +149,113 @@ ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; */ static u32 bsp_cpuid_1_eax __ro_after_init; +static bool sha_check = true; + +struct patch_digest { + u32 patch_id; + u8 sha256[SHA256_DIGEST_SIZE]; +}; + +#include "amd_shas.c" + +static int cmp_id(const void *key, const void *elem) +{ + struct patch_digest *pd = (struct patch_digest *)elem; + u32 patch_id = *(u32 *)key; + + if (patch_id == pd->patch_id) + return 0; + else if (patch_id < pd->patch_id) + return -1; + else + return 1; +} + +static bool need_sha_check(u32 cur_rev) +{ + switch (cur_rev >> 8) { + case 0x80012: return cur_rev <= 0x800126f; break; + case 0x80082: return cur_rev <= 0x800820f; break; + case 0x83010: return cur_rev <= 0x830107c; break; + case 0x86001: return cur_rev <= 0x860010e; break; + case 0x86081: return cur_rev <= 0x8608108; break; + case 0x87010: return cur_rev <= 0x8701034; break; + case 0x8a000: return cur_rev <= 0x8a0000a; break; + case 0xa0010: return cur_rev <= 0xa00107a; break; + case 0xa0011: return cur_rev <= 0xa0011da; break; + case 0xa0012: return cur_rev <= 0xa001243; break; + case 0xa0082: return cur_rev <= 0xa00820e; break; + case 0xa1011: return cur_rev <= 0xa101153; break; + case 0xa1012: return cur_rev <= 0xa10124e; break; + case 0xa1081: return cur_rev <= 0xa108109; break; + case 0xa2010: return cur_rev <= 0xa20102f; break; + case 0xa2012: return cur_rev <= 0xa201212; break; + case 0xa4041: return cur_rev <= 0xa404109; break; + case 0xa5000: return cur_rev <= 0xa500013; break; + case 0xa6012: return cur_rev <= 0xa60120a; break; + case 0xa7041: return cur_rev <= 0xa704109; break; + case 0xa7052: return cur_rev <= 0xa705208; break; + case 0xa7080: return cur_rev <= 0xa708009; break; + case 0xa70c0: return cur_rev <= 0xa70C009; break; + case 0xaa001: return cur_rev <= 0xaa00116; break; + case 0xaa002: return cur_rev <= 0xaa00218; break; + default: break; + } + + pr_info("You should not be seeing this. Please send the following couple of lines to x86--kernel.org\n"); + pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); + return true; +} + +static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len) +{ + struct patch_digest *pd = NULL; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state s; + int i; + + if (x86_family(bsp_cpuid_1_eax) < 0x17 || + x86_family(bsp_cpuid_1_eax) > 0x19) + return true; + + if (!need_sha_check(cur_rev)) + return true; + + if (!sha_check) + return true; + + pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id); + if (!pd) { + pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id); + return false; + } + + sha256_init(&s); + sha256_update(&s, data, len); + sha256_final(&s, digest); + + if (memcmp(digest, pd->sha256, sizeof(digest))) { + pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); + + for (i = 0; i < SHA256_DIGEST_SIZE; i++) + pr_cont("0x%x ", digest[i]); + pr_info("\n"); + + return false; + } + + return true; +} + +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -246,8 +357,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) * On success, @sh_psize returns the patch size according to the section header, * to the caller. */ -static bool -__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) +static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) { u32 p_type, p_size; const u32 *hdr; @@ -484,10 +594,13 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) } } -static bool __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize) +static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, + unsigned int psize) { unsigned long p_addr = (unsigned long)&mc->hdr.data_code; - u32 rev, dummy; + + if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) + return -1; native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr); @@ -505,47 +618,13 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, unsigned int psize) } /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - if (rev != mc->hdr.patch_id) + *cur_rev = get_patch_level(); + if (*cur_rev != mc->hdr.patch_id) return false; return true; } -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - * - * Returns true if container found (sets @desc), false otherwise. - */ -static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) -{ - struct cont_desc desc = { 0 }; - struct microcode_amd *mc; - - scan_containers(ucode, size, &desc); - - mc = desc.mc; - if (!mc) - return false; - - /* - * Allow application of the same revision to pick up SMT-specific - * changes even if the revision of the other SMT thread is already - * up-to-date. - */ - if (old_rev > mc->hdr.patch_id) - return false; - - return __apply_microcode_amd(mc, desc.psize); -} - static bool get_builtin_microcode(struct cpio_data *cp) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; @@ -583,14 +662,35 @@ static bool __init find_blobs_in_containers(struct cpio_data *ret) return found; } +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { + struct cont_desc desc = { }; + struct microcode_amd *mc; struct cpio_data cp = { }; - u32 dummy; + char buf[4]; + u32 rev; + + if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) { + if (!strncmp(buf, "off", 3)) { + sha_check = false; + pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + } bsp_cpuid_1_eax = cpuid_1_eax; - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); + rev = get_patch_level(); + ed->old_rev = rev; /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; @@ -598,37 +698,23 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_ if (!find_blobs_in_containers(&cp)) return; - if (early_apply_microcode(ed->old_rev, cp.data, cp.size)) - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); -} - -static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size); - -static int __init save_microcode_in_initrd(void) -{ - unsigned int cpuid_1_eax = native_cpuid_eax(1); - struct cpuinfo_x86 *c = &boot_cpu_data; - struct cont_desc desc = { 0 }; - enum ucode_state ret; - struct cpio_data cp; - - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) - return 0; - - if (!find_blobs_in_containers(&cp)) - return -EINVAL; - scan_containers(cp.data, cp.size, &desc); - if (!desc.mc) - return -EINVAL; - ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); - if (ret > UCODE_UPDATED) - return -EINVAL; + mc = desc.mc; + if (!mc) + return; - return 0; + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (ed->old_rev > mc->hdr.patch_id) + return; + + if (__apply_microcode_amd(mc, &rev, desc.psize)) + ed->new_rev = rev; } -early_initcall(save_microcode_in_initrd); static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n, @@ -729,14 +815,9 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u32 rev, dummy __always_unused; u16 equiv_id = 0; - /* fetch rev if not populated yet: */ - if (!uci->cpu_sig.rev) { - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - uci->cpu_sig.rev = rev; - } + uci->cpu_sig.rev = get_patch_level(); if (x86_family(bsp_cpuid_1_eax) < 0x17) { equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); @@ -759,22 +840,20 @@ void reload_ucode_amd(unsigned int cpu) mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - + rev = get_patch_level(); if (rev < mc->hdr.patch_id) { - if (__apply_microcode_amd(mc, p->size)) - pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); + if (__apply_microcode_amd(mc, &rev, p->size)) + pr_info_once("reload revision: 0x%08x\n", rev); } } static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct ucode_patch *p; csig->sig = cpuid_eax(0x00000001); - csig->rev = c->microcode; + csig->rev = get_patch_level(); /* * a patch could have been loaded early, set uci->mc so that @@ -815,7 +894,7 @@ static enum ucode_state apply_microcode_amd(int cpu) goto out; } - if (!__apply_microcode_amd(mc_amd, p->size)) { + if (!__apply_microcode_amd(mc_amd, &rev, p->size)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; @@ -937,8 +1016,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, } /* Scan the blob in @data and add microcode patches to the cache. */ -static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, - size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { u8 *fw = (u8 *)data; size_t offset; @@ -996,7 +1074,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz if (ret != UCODE_OK) return ret; - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); @@ -1013,6 +1091,32 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz return ret; } +static int __init save_microcode_in_initrd(void) +{ + unsigned int cpuid_1_eax = native_cpuid_eax(1); + struct cpuinfo_x86 *c = &boot_cpu_data; + struct cont_desc desc = { 0 }; + enum ucode_state ret; + struct cpio_data cp; + + if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + return 0; + + if (!find_blobs_in_containers(&cp)) + return -EINVAL; + + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; + + ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + if (ret > UCODE_UPDATED) + return -EINVAL; + + return 0; +} +early_initcall(save_microcode_in_initrd); + /* * AMD microcode firmware naming convention, up to family 15h they are in * the legacy file: diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c new file mode 100644 index 0000000000000..2a1655b1fdd88 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_shas.c @@ -0,0 +1,444 @@ +/* Keep 'em sorted. */ +static const struct patch_digest phashes[] = { + { 0x8001227, { + 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b, + 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46, + 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8, + 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18, + } + }, + { 0x8001250, { + 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60, + 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa, + 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3, + 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19, + } + }, + { 0x800126e, { + 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c, + 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3, + 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6, + 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43, + } + }, + { 0x800126f, { + 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec, + 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b, + 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18, + 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33, + } + }, + { 0x800820d, { + 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59, + 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67, + 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c, + 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e, + } + }, + { 0x8301025, { + 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36, + 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1, + 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30, + 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77, + } + }, + { 0x8301055, { + 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a, + 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea, + 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b, + 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b, + } + }, + { 0x8301072, { + 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e, + 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28, + 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1, + 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30, + } + }, + { 0x830107a, { + 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72, + 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34, + 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20, + 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a, + } + }, + { 0x830107b, { + 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad, + 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a, + 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04, + 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19, + } + }, + { 0x830107c, { + 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47, + 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac, + 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3, + 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38, + } + }, + { 0x860010d, { + 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0, + 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7, + 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c, + 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8, + } + }, + { 0x8608108, { + 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2, + 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38, + 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc, + 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd, + } + }, + { 0x8701034, { + 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83, + 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d, + 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8, + 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b, + } + }, + { 0x8a00008, { + 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e, + 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e, + 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72, + 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c, + } + }, + { 0x8a0000a, { + 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c, + 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44, + 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb, + 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20, + } + }, + { 0xa00104c, { + 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe, + 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76, + 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b, + 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b, + } + }, + { 0xa00104e, { + 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2, + 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0, + 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e, + 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b, + } + }, + { 0xa001053, { + 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d, + 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25, + 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb, + 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3, + } + }, + { 0xa001058, { + 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36, + 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19, + 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec, + 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2, + } + }, + { 0xa001075, { + 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9, + 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a, + 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f, + 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0, + } + }, + { 0xa001078, { + 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25, + 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce, + 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04, + 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e, + } + }, + { 0xa001079, { + 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb, + 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93, + 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb, + 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a, + } + }, + { 0xa00107a, { + 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f, + 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd, + 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f, + 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18, + } + }, + { 0xa001143, { + 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80, + 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42, + 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d, + 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8, + } + }, + { 0xa001144, { + 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41, + 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b, + 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25, + 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc, + } + }, + { 0xa00115d, { + 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd, + 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75, + 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e, + 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05, + } + }, + { 0xa001173, { + 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a, + 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35, + 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3, + 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e, + } + }, + { 0xa0011a8, { + 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b, + 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6, + 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4, + 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e, + } + }, + { 0xa0011ce, { + 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71, + 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86, + 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e, + 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a, + } + }, + { 0xa0011d1, { + 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e, + 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09, + 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5, + 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8, + } + }, + { 0xa0011d3, { + 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b, + 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6, + 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00, + 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26, + } + }, + { 0xa0011d5, { + 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13, + 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7, + 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62, + 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21, + } + }, + { 0xa001223, { + 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8, + 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4, + 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15, + 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45, + } + }, + { 0xa001224, { + 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25, + 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad, + 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9, + 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a, + } + }, + { 0xa001227, { + 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad, + 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d, + 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9, + 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0, + } + }, + { 0xa001229, { + 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6, + 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75, + 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4, + 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d, + } + }, + { 0xa00122e, { + 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf, + 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15, + 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa, + 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10, + } + }, + { 0xa001231, { + 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e, + 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64, + 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b, + 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd, + } + }, + { 0xa001234, { + 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7, + 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc, + 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b, + 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a, + } + }, + { 0xa001236, { + 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78, + 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d, + 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b, + 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25, + } + }, + { 0xa001238, { + 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc, + 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a, + 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e, + 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59, + } + }, + { 0xa00820c, { + 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3, + 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63, + 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1, + 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2, + } + }, + { 0xa10113e, { + 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10, + 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0, + 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5, + 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53, + } + }, + { 0xa101144, { + 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26, + 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09, + 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc, + 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47, + } + }, + { 0xa101148, { + 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90, + 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f, + 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08, + 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4, + } + }, + { 0xa10123e, { + 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18, + 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d, + 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc, + 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b, + } + }, + { 0xa101244, { + 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c, + 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1, + 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72, + 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0, + } + }, + { 0xa101248, { + 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e, + 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22, + 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e, + 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75, + } + }, + { 0xa108108, { + 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9, + 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6, + 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c, + 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16, + } + }, + { 0xa20102d, { + 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11, + 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89, + 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23, + 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4, + } + }, + { 0xa201210, { + 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe, + 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9, + 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68, + 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41, + } + }, + { 0xa404107, { + 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45, + 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0, + 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81, + 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99, + } + }, + { 0xa500011, { + 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4, + 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1, + 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2, + 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74, + } + }, + { 0xa601209, { + 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32, + 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30, + 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46, + 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d, + } + }, + { 0xa704107, { + 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6, + 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93, + 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2, + 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39, + } + }, + { 0xa705206, { + 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4, + 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7, + 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b, + 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc, + } + }, + { 0xa708007, { + 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3, + 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2, + 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80, + 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93, + } + }, + { 0xa70c005, { + 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b, + 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f, + 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55, + 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13, + } + }, + { 0xaa00116, { + 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63, + 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5, + 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d, + 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9, + } + }, + { 0xaa00212, { + 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75, + 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71, + 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2, + 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1, + } + }, + { 0xaa00213, { + 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a, + 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f, + 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed, + 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b, + } + }, + { 0xaa00215, { + 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9, + 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4, + 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b, + 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef, + } + }, +}; diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 21776c529fa97..5df621752fefa 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -100,14 +100,12 @@ extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); -int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } -static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } static inline void exit_amd_microcode(void) { } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 2fdfda2b60e4f..6be3cade4134c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -646,10 +647,10 @@ static void __init print_mtrr_state(void) pr_info("MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - pr_info("MTRR fixed ranges %sabled:\n", - ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? - "en" : "dis"); + pr_info("MTRR fixed ranges %s:\n", + str_enabled_disabled( + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED))); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -661,8 +662,8 @@ static void __init print_mtrr_state(void) /* tail */ print_fixed_last(); } - pr_info("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + pr_info("MTRR variable ranges %s:\n", + str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)); high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a5c506f6da7fa..4049235b1bfe4 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) char *ptr; char line[LINE_SIZE]; int length; - size_t linelen; memset(line, 0, LINE_SIZE); @@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) if (length < 0) return length; - linelen = strlen(line); - ptr = line + linelen - 1; - if (linelen && *ptr == '\n') + ptr = line + length - 1; + if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 22b65a5f5ec6c..7f8d1e11dbee2 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -150,13 +150,15 @@ int __init sgx_drv_init(void) u64 xfrm_mask; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; + } cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); if (!(eax & 1)) { - pr_err("SGX disabled: SGX1 instruction support not available.\n"); + pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; } @@ -173,8 +175,10 @@ int __init sgx_drv_init(void) } ret = misc_register(&sgx_dev_enclave); - if (ret) + if (ret) { + pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret); return ret; + } return 0; } diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index b65ab214bdf57..776a20172867e 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -64,6 +64,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; + /* + * ECREATE would detect this too, but checking here also ensures + * that the 'encl_size' calculations below can never overflow. + */ + if (!is_power_of_2(secs->size)) + return -EINVAL; + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 340af81556584..0be61c45400cd 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -140,7 +140,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) x86_platform.guest.enc_kexec_begin(); x86_platform.guest.enc_kexec_finish(); - crash_save_cpu(regs, safe_smp_processor_id()); + crash_save_cpu(regs, smp_processor_id()); } #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 59d23cdf4ed0f..dd8748c45529a 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ /* * Architecture specific OF callbacks. */ +#include #include #include #include @@ -313,6 +314,6 @@ void __init x86_flattree_get_config(void) if (initial_dtb) early_memunmap(dt, map_len); #endif - if (of_have_populated_dt()) + if (acpi_disabled && of_have_populated_dt()) x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config; } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b4905d5173fd3..722fd712e1cf0 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f05339fee7785..6c5defd6569a3 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 82b96ed9890ab..57120f0749cc3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -28,18 +28,13 @@ * the first 128 E820 memory entries in boot_params.e820_table and the remaining * (if any) entries of the SETUP_E820_EXT nodes. We use this to: * - * - inform the user about the firmware's notion of memory layout - * via /sys/firmware/memmap - * * - the hibernation code uses it to generate a kernel-independent CRC32 * checksum of the physical memory layout of a system. * * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version * passed to us by the bootloader - the major difference between - * e820_table_firmware[] and this one is that, the latter marks the setup_data - * list created by the EFI boot stub as reserved, so that kexec can reuse the - * setup_data information in the second kernel. Besides, e820_table_kexec[] - * might also be modified by the kexec itself to fake a mptable. + * e820_table_firmware[] and this one is that e820_table_kexec[] + * might be modified by the kexec itself to fake an mptable. * We use this to: * * - kexec, which is a bootloader in disguise, uses the original E820 @@ -47,6 +42,11 @@ * can have a restricted E820 map while the kexec()-ed kexec-kernel * can have access to full memory - etc. * + * Export the memory layout via /sys/firmware/memmap. kexec-tools uses + * the entries to create an E820 table for the kexec kernel. + * + * kexec_file_load in-kernel code uses the table for the kexec kernel. + * * - 'e820_table': this is the main E820 table that is massaged by the * low level x86 platform code, or modified by boot parameters, before * passed on to higher level MM layers. @@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type) static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: /* Fall through: */ - case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RAM: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; @@ -764,7 +763,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn) pfn = PFN_DOWN(entry->addr + entry->size); - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM) register_nosave_region(PFN_UP(entry->addr), pfn); if (pfn >= limit_pfn) @@ -990,60 +989,6 @@ static int __init parse_memmap_opt(char *str) } early_param("memmap", parse_memmap_opt); -/* - * Reserve all entries from the bootloader's extensible data nodes list, - * because if present we are going to use it later on to fetch e820 - * entries from it: - */ -void __init e820__reserve_setup_data(void) -{ - struct setup_indirect *indirect; - struct setup_data *data; - u64 pa_data, pa_next; - u32 len; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - if (!data) { - pr_warn("e820: failed to memremap setup_data entry\n"); - return; - } - - len = sizeof(*data); - pa_next = data->next; - - e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - - if (data->type == SETUP_INDIRECT) { - len += data->len; - early_memunmap(data, sizeof(*data)); - data = early_memremap(pa_data, len); - if (!data) { - pr_warn("e820: failed to memremap indirect setup_data\n"); - return; - } - - indirect = (struct setup_indirect *)data->data; - - if (indirect->type != SETUP_INDIRECT) - e820__range_update(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } - - pa_data = pa_next; - early_memunmap(data, len); - } - - e820__update_table(e820_table); - - pr_info("extended physical RAM map:\n"); - e820__print_table("reserve setup_data"); -} - /* * Called after parse_early_param(), after early parameters (such as mem=) * have been processed, in which case we already have an E820 table filled in @@ -1063,7 +1008,6 @@ void __init e820__finish_early_params(void) static const char *__init e820_type_to_string(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return "System RAM"; case E820_TYPE_ACPI: return "ACPI Tables"; case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; @@ -1079,7 +1023,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; case E820_TYPE_ACPI: /* Fall-through: */ case E820_TYPE_NVS: /* Fall-through: */ @@ -1101,7 +1044,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ default: return IORES_DESC_NONE; @@ -1124,7 +1066,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; - case E820_TYPE_RESERVED_KERN: case E820_TYPE_RAM: case E820_TYPE_ACPI: case E820_TYPE_NVS: @@ -1176,9 +1117,9 @@ void __init e820__reserve_resources(void) res++; } - /* Expose the bootloader-provided memory layout to the sysfs. */ - for (i = 0; i < e820_table_firmware->nr_entries; i++) { - struct e820_entry *entry = e820_table_firmware->entries + i; + /* Expose the kexec e820 table to the sysfs. */ + for (i = 0; i < e820_table_kexec->nr_entries; i++) { + struct e820_entry *entry = e820_table_kexec->entries + i; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } @@ -1302,6 +1243,36 @@ void __init e820__memblock_setup(void) int i; u64 end; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + + /* + * At this point only the first megabyte is mapped for sure, the + * rest of the memory cannot be used for memblock resizing + */ + memblock_set_current_limit(ISA_END_ADDRESS); + /* * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries @@ -1323,7 +1294,7 @@ void __init e820__memblock_setup(void) if (entry->type == E820_TYPE_SOFT_RESERVED) memblock_reserve(entry->addr, entry->size); - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM) continue; memblock_add(entry->addr, entry->size); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 44f937015e1e2..fc1714bad0458 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -94,26 +95,28 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ -static unsigned int io_serial_in(unsigned long addr, int offset) +static __noendbr unsigned int io_serial_in(unsigned long addr, int offset) { return inb(addr + offset); } +ANNOTATE_NOENDBR_SYM(io_serial_in); -static void io_serial_out(unsigned long addr, int offset, int value) +static __noendbr void io_serial_out(unsigned long addr, int offset, int value) { outb(value, addr + offset); } +ANNOTATE_NOENDBR_SYM(io_serial_out); -static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; -static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; +DEFINE_STATIC_CALL(serial_in, io_serial_in); +DEFINE_STATIC_CALL(serial_out, io_serial_out); static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) + while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - serial_out(early_serial_base, TXR, ch); + static_call(serial_out)(early_serial_base, TXR, ch); return timeout ? 0 : -1; } @@ -131,16 +134,16 @@ static __init void early_serial_hw_init(unsigned divisor) { unsigned char c; - serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ - serial_out(early_serial_base, IER, 0); /* no interrupt */ - serial_out(early_serial_base, FCR, 0); /* no fifo */ - serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */ + static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */ + static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */ + static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */ - c = serial_in(early_serial_base, LCR); - serial_out(early_serial_base, LCR, c | DLAB); - serial_out(early_serial_base, DLL, divisor & 0xff); - serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); - serial_out(early_serial_base, LCR, c & ~DLAB); + c = static_call(serial_in)(early_serial_base, LCR); + static_call(serial_out)(early_serial_base, LCR, c | DLAB); + static_call(serial_out)(early_serial_base, DLL, divisor & 0xff); + static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff); + static_call(serial_out)(early_serial_base, LCR, c & ~DLAB); } #define DEFAULT_BAUD 9600 @@ -183,28 +186,26 @@ static __init void early_serial_init(char *s) /* Convert from baud to divisor value */ divisor = 115200 / baud; - /* These will always be IO based ports */ - serial_in = io_serial_in; - serial_out = io_serial_out; - /* Set up the HW */ early_serial_hw_init(divisor); } #ifdef CONFIG_PCI -static void mem32_serial_out(unsigned long addr, int offset, int value) +static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value) { u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ writel(value, vaddr + offset); } +ANNOTATE_NOENDBR_SYM(mem32_serial_out); -static unsigned int mem32_serial_in(unsigned long addr, int offset) +static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset) { u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ return readl(vaddr + offset); } +ANNOTATE_NOENDBR_SYM(mem32_serial_in); /* * early_pci_serial_init() @@ -278,15 +279,13 @@ static __init void early_pci_serial_init(char *s) */ if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ - serial_in = io_serial_in; - serial_out = io_serial_out; early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ - serial_in = mem32_serial_in; - serial_out = mem32_serial_out; + static_call_update(serial_in, mem32_serial_in); + static_call_update(serial_out, mem32_serial_out); /* WARNING! assuming the address is always in the first 4G */ early_serial_base = (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1209c7aebb211..422c98ca6eb86 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -60,9 +60,16 @@ bool irq_fpu_usable(void) if (WARN_ON_ONCE(in_nmi())) return false; - /* In kernel FPU usage already active? */ - if (this_cpu_read(in_kernel_fpu)) + /* + * In kernel FPU usage already active? This detects any explicitly + * nested usage in task or softirq context, which is unsupported. It + * also detects attempted usage in a hardirq that has interrupted a + * kernel-mode FPU section. + */ + if (this_cpu_read(in_kernel_fpu)) { + WARN_ON_FPU(!in_hardirq()); return false; + } /* * When not in NMI or hard interrupt context, FPU can be used in: @@ -220,7 +227,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) struct fpstate *fpstate; unsigned int size; - size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; @@ -420,7 +427,8 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { - preempt_disable(); + if (!irqs_disabled()) + fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); @@ -448,7 +456,8 @@ void kernel_fpu_end(void) WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); - preempt_enable(); + if (!irqs_disabled()) + fpregs_unlock(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index dbdb31f55fc7f..975de070c9c98 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -18,7 +18,7 @@ static __always_inline __pure bool use_fxsr(void) #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8f62e0666dea5..6c69cb28b2983 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -27,19 +27,14 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, struct _fpx_sw_bytes *fx_sw) { - int min_xstate_size = sizeof(struct fxregs_state) + - sizeof(struct xstate_header); void __user *fpstate = fxbuf; unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) return false; - /* Check for the first magic field and other error scenarios. */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || - fx_sw->xstate_size > fx_sw->extended_size) + /* Check for the first magic field */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1) goto setfx; /* @@ -48,7 +43,7 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + current->thread.fpu.fpstate->user_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 27417b685c1db..6a41d1610d8bc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -259,32 +259,20 @@ static void __init setup_xstate_cache(void) } } -static void __init print_xstate_feature(u64 xstate_mask) -{ - const char *feature_name; - - if (cpu_has_xfeatures(xstate_mask, &feature_name)) - pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); -} - /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { - print_xstate_feature(XFEATURE_MASK_FP); - print_xstate_feature(XFEATURE_MASK_SSE); - print_xstate_feature(XFEATURE_MASK_YMM); - print_xstate_feature(XFEATURE_MASK_BNDREGS); - print_xstate_feature(XFEATURE_MASK_BNDCSR); - print_xstate_feature(XFEATURE_MASK_OPMASK); - print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); - print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); - print_xstate_feature(XFEATURE_MASK_PKRU); - print_xstate_feature(XFEATURE_MASK_PASID); - print_xstate_feature(XFEATURE_MASK_CET_USER); - print_xstate_feature(XFEATURE_MASK_XTILE_CFG); - print_xstate_feature(XFEATURE_MASK_XTILE_DATA); + int i; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = BIT_ULL(i); + const char *name; + + if (cpu_has_xfeatures(mask, &name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); + } } /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 166bc0ea3bdff..cace6e8d7cc77 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) { + if (ftrace_poke_late) text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - } else { - mutex_lock(&text_mutex); - text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); - mutex_unlock(&text_mutex); - } + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } @@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - void *ret; + int ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = text_poke_copy(trampoline, (void *)start_offset, size); - if (WARN_ON(!ret)) + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - text_poke_copy(ip, retq, sizeof(retq)); + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - if (!text_poke_copy(ip, x86_nops[2], 2)) + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) goto fail; } @@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - text_poke_copy(ptr, &ops, sizeof(unsigned long)); + *ptr = (unsigned long)ops; op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - text_poke_copy_locked(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE, false); + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index d516472285967..367da36381678 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -146,12 +146,14 @@ SYM_FUNC_END(ftrace_stub_graph) #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) SYM_FUNC_START(ftrace_caller) + ANNOTATE_NOENDBR /* save_mcount_regs fills in first two parameters */ save_mcount_regs @@ -197,6 +199,7 @@ SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_regs_caller) + ANNOTATE_NOENDBR /* Save the current flags before any operations that can change them */ pushfq @@ -310,6 +313,7 @@ SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) SYM_FUNC_START(ftrace_stub_direct_tramp) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(ftrace_stub_direct_tramp) @@ -317,6 +321,7 @@ SYM_FUNC_END(ftrace_stub_direct_tramp) #else /* ! CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT cmpq $ftrace_stub, ftrace_trace_function diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 22c9ba305ac17..05f8b8acf7845 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -567,7 +567,7 @@ void early_setup_idt(void) */ void __head startup_64_setup_gdt_idt(void) { - struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt); + struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt; void *handler = NULL; struct desc_ptr startup_gdt_descr = { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 31345e0ba0064..fefe2a25cf025 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -61,11 +61,14 @@ SYM_CODE_START_NOALIGN(startup_64) /* Set up the stack for verify_cpu() */ leaq __top_init_kernel_stack(%rip), %rsp - /* Setup GSBASE to allow stack canary access for C code */ + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. + */ movl $MSR_GS_BASE, %ecx - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx - movl %edx, %eax - shrq $32, %rdx + xorl %eax, %eax + xorl %edx, %edx wrmsr call startup_64_setup_gdt_idt @@ -319,7 +322,7 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) * * RDX contains the per-cpu offset */ - movq pcpu_hot + X86_current_task(%rdx), %rax + movq current_task(%rdx), %rax movq TASK_threadsp(%rax), %rsp /* @@ -359,17 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) movl %eax,%fs movl %eax,%gs - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx -#ifndef CONFIG_SMP - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx -#endif movl %edx, %eax shrq $32, %rdx wrmsr @@ -435,7 +433,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx + movq PER_CPU_VAR(current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index c20d1832c4812..2bade73f49e33 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -23,6 +23,7 @@ #include #include #include +#include /* * This is the 'legacy' 8259A Programmable Interrupt Controller, diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index e2fab3ceb09fb..6290dd120f5e4 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -144,7 +144,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * Update the sequence number to force a TSS update on return to * user mode. */ - iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + iobm->sequence = atomic64_inc_return(&io_bitmap_sequence); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 385e3a5fc3045..81f9b78e0f7ba 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -25,12 +25,19 @@ #include #include +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR) #define CREATE_TRACE_POINTS #include +#endif DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending); +EXPORT_PER_CPU_SYMBOL(__softirq_pending); + +DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); + atomic_t irq_err_count; /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index dc1049c01f9b9..c7a5d2960d575 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -29,12 +29,9 @@ int sysctl_panic_on_stackoverflow __read_mostly; /* Debugging check for stack overflow: is there less than 1KB free? */ -static int check_stack_overflow(void) +static bool check_stack_overflow(void) { - long sp; - - __asm__ __volatile__("andl %%esp,%0" : - "=r" (sp) : "0" (THREAD_SIZE - 1)); + unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1); return sp < (sizeof(struct thread_info) + STACK_WARN); } @@ -48,18 +45,19 @@ static void print_stack_overflow(void) } #else -static inline int check_stack_overflow(void) { return 0; } +static inline bool check_stack_overflow(void) { return false; } static inline void print_stack_overflow(void) { } #endif +DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); + static void call_on_stack(void *func, void *stack) { - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=b" (stack) - : "0" (stack), - [thunk_target] "D"(func) + "movl %[sp], %%esp" + : [sp] "+b" (stack) + : [thunk_target] "D" (func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -68,13 +66,13 @@ static inline void *current_stack(void) return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } -static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc) { struct irq_stack *curstk, *irqstk; - u32 *isp, *prev_esp, arg1; + u32 *isp, *prev_esp; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -83,7 +81,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) * current stack (which is the irq stack already after all) */ if (unlikely(curstk == irqstk)) - return 0; + return false; isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -94,14 +92,13 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=a" (arg1), "=b" (isp) - : "0" (desc), "1" (isp), - [thunk_target] "D" (desc->handle_irq) - : "memory", "cc", "ecx"); - return 1; + "movl %[sp], %%esp" + : "+a" (desc), [sp] "+b" (isp) + : [thunk_target] "D" (desc->handle_irq) + : "memory", "cc", "edx", "ecx"); + return true; } /* @@ -112,7 +109,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -124,8 +121,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -135,7 +132,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -150,7 +147,7 @@ void do_softirq_own_stack(void) void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - int overflow = check_stack_overflow(); + bool overflow = check_stack_overflow(); if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index ade0043ce56ef..ca78dce39361f 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,8 +26,8 @@ #include #include +DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; -DECLARE_INIT_PER_CPU(irq_stack_backing_store); #ifdef CONFIG_VMAP_STACK /* @@ -51,7 +51,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -64,14 +64,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 7f542a7799cb7..fdabd5dda154a 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -9,6 +9,7 @@ */ .pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) + ENDBR pushf pop %_ASM_AX RET diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 72e6a45e7ec24..09608fd936876 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -373,16 +373,7 @@ static bool can_probe(unsigned long paddr) kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - u32 insn; - - /* - * Since 'addr' is not guaranteed to be safe to access, use - * copy_from_kernel_nofault() to read the instruction: - */ - if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) - return NULL; - - if (is_endbr(insn)) { + if (is_endbr((u32 *)addr)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7a422a6c5983c..3be9b3342c672 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; pr_info("KVM setup pv remote TLB flush\n"); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8984abd91c001..a7998f3517017 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,20 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, goto overflow; size = 4; break; +#if defined(CONFIG_STACKPROTECTOR) && \ + defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 + case R_X86_64_REX_GOTPCRELX: { + static unsigned long __percpu *const addr = &__stack_chk_guard; + + if (sym->st_value != (u64)addr) { + pr_err("%s: Unsupported GOTPCREL relocation\n", me->name); + return -ENOEXEC; + } + + val = (u64)&addr + rel[i].r_addend; + fallthrough; + } +#endif case R_X86_64_PC32: case R_X86_64_PLT32: val -= (u64)loc; @@ -146,21 +161,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - void *wr_loc = module_writable_address(me, loc); - - if (memcmp(wr_loc, &zero, size)) { + if (memcmp(loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(wr_loc, &val, size); + write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -227,7 +239,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -236,6 +248,8 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -266,60 +280,33 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size, me); + apply_retpolines(rseg, rseg + retpolines->sh_size); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size, me); - } - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size, me); + apply_returns(rseg, rseg + returns->sh_size); } - if (calls || alt) { + if (calls) { struct callthunk_sites cs = {}; - if (calls) { - cs.call_start = (void *)calls->sh_addr; - cs.call_end = (void *)calls->sh_addr + calls->sh_size; - } - - if (alt) { - cs.alt_start = (void *)alt->sh_addr; - cs.alt_end = (void *)alt->sh_addr + alt->sh_size; - } + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; callthunks_patch_module_calls(&cs, me); } + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } - - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - - return 0; -} - -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *locks = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - } - if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -329,6 +316,10 @@ int module_post_finalize(const Elf_Ehdr *hdr, text, text_end); } + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ed163c8c8604e..9a95d00f14233 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -40,8 +40,12 @@ #define CREATE_TRACE_POINTS #include +/* + * An emergency handler can be set in any context including NMI + */ struct nmi_desc { raw_spinlock_t lock; + nmi_handler_t emerg_handler; struct list_head head; }; @@ -132,9 +136,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); + nmi_handler_t ehandler; struct nmiaction *a; int handled=0; + /* + * Call the emergency handler, if set + * + * In the case of crash_nmi_callback() emergency handler, it will + * return in the case of the crashing CPU to enable it to complete + * other necessary crashing actions ASAP. Other handlers in the + * linked list won't need to be run. + */ + ehandler = desc->emerg_handler; + if (ehandler) + return ehandler(type, regs); + rcu_read_lock(); /* @@ -224,6 +241,31 @@ void unregister_nmi_handler(unsigned int type, const char *name) } EXPORT_SYMBOL_GPL(unregister_nmi_handler); +/** + * set_emergency_nmi_handler - Set emergency handler + * @type: NMI type + * @handler: the emergency handler to be stored + * + * Set an emergency NMI handler which, if set, will preempt all the other + * handlers in the linked list. If a NULL handler is passed in, it will clear + * it. It is expected that concurrent calls to this function will not happen + * or the system is screwed beyond repair. + */ +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler) +{ + struct nmi_desc *desc = nmi_to_desc(type); + + if (WARN_ON_ONCE(desc->emerg_handler == handler)) + return; + desc->emerg_handler = handler; + + /* + * Ensure the emergency handler is visible to other CPUs before + * function return + */ + smp_wmb(); +} + static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ccaa3397a670..97925632c28ee 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,21 +59,6 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } -#ifndef CONFIG_PT_RECLAIM -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -90,30 +75,20 @@ void paravirt_set_sched_clock(u64 (*func)(void)) static_call_update(pv_sched_clock, func); } -/* These are in entry.S */ -static struct resource reserve_ioports = { - .start = 0, - .end = IO_SPACE_LIMIT, - .name = "paravirt-ioport", - .flags = IORESOURCE_IO | IORESOURCE_BUSY, -}; +#ifdef CONFIG_PARAVIRT_XXL +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} -/* - * Reserve the whole legacy IO space to prevent any legacy drivers - * from wasting time probing for their hardware. This is a fairly - * brute-force approach to disabling all non-virtual drivers. - * - * Note that this must be called very early to have any effect. - */ -int paravirt_disable_iospace(void) +static noinstr unsigned long pv_native_read_cr3(void) { - return request_resource(&ioport_resource, &reserve_ioports); + return __native_read_cr3(); } -#ifdef CONFIG_PARAVIRT_XXL -static noinstr void pv_native_write_cr2(unsigned long val) +static noinstr void pv_native_write_cr3(unsigned long cr3) { - native_write_cr2(val); + native_write_cr3(cr3); } static noinstr unsigned long pv_native_get_debugreg(int regno) @@ -195,7 +170,6 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, @@ -203,8 +177,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, - .mmu.read_cr3 = __native_read_cr3, - .mmu.write_cr3 = native_write_cr3, + .mmu.read_cr3 = pv_native_read_cr3, + .mmu.write_cr3 = pv_native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6da6769d7254a..9c75d701011fd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -93,7 +93,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size); + /* init_task is not dynamically sized (incomplete FPU state) */ + if (unlikely(src == &init_task)) + memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0); + else + memcpy(dst, src, arch_task_struct_size); + #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif @@ -1043,7 +1048,7 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(int option, unsigned long arg2) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { switch (option) { case ARCH_GET_CPUID: @@ -1058,5 +1063,8 @@ long do_arch_prctl_common(int option, unsigned long arg2) return fpu_xstate_prctl(option, arg2); } + if (!in_ia32_syscall()) + return do_arch_prctl_64(current, option, arg2); + return -EINVAL; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0917c7f25720b..4636ef3599737 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -190,13 +190,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and pcpu_hot.top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(pcpu_hot.top_of_stack, + this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -206,7 +206,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - raw_cpu_write(pcpu_hot.current_task, next_p); + raw_cpu_write(current_task, next_p); switch_fpu_finish(next_p); @@ -215,8 +215,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } - -SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - return do_arch_prctl_common(option, arg2); -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 226472332a70d..7196ca7048be0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -614,7 +614,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(pcpu_hot.hardirq_stack_inuse)); + this_cpu_read(hardirq_stack_inuse)); if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_p, cpu); @@ -668,8 +668,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - raw_cpu_write(pcpu_hot.current_task, next_p); - raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(current_task, next_p); + raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(next_p); @@ -942,7 +942,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, arg2); # endif -# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +# ifdef CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso_image_32, arg2); # endif @@ -979,26 +979,3 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) return ret; } - -SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - long ret; - - ret = do_arch_prctl_64(current, option, arg2); - if (ret == -EINVAL) - ret = do_arch_prctl_common(option, arg2); - - return ret; -} - -#ifdef CONFIG_IA32_EMULATION -COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - return do_arch_prctl_common(option, arg2); -} -#endif - -unsigned long KSTK_ESP(struct task_struct *task) -{ - return task_pt_regs(task)->sp; -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6d0df6a58873d..a92f18db96100 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -10,6 +10,8 @@ #include #include +#include + #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void quirk_intel_irqbalance(struct pci_dev *dev) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index dc1dd3f3e67fc..964f6b0a3d68c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -921,20 +921,16 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) return; /* Make a note of crashing cpu. Will be used in NMI callback. */ - crashing_cpu = safe_smp_processor_id(); + crashing_cpu = smp_processor_id(); shootdown_callback = callback; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, - NMI_FLAG_FIRST, "crash")) - return; /* Return what? */ + /* - * Ensure the new callback function is set before sending - * out the NMI + * Set emergency handler to preempt other handlers. */ - wmb(); + set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback); apic_send_IPI_allbutself(NMI_VECTOR); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cebee310e2009..c7164a8de983c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -56,6 +56,9 @@ #include #include #include +#if defined(CONFIG_X86_LOCAL_APIC) +#include +#endif /* * max_low_pfn_mapped: highest directly mapped pfn < 4 GB @@ -146,6 +149,69 @@ static size_t ima_kexec_buffer_size; /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; +static const struct ctl_table x86_sysctl_table[] = { + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_X86_LOCAL_APIC) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +}; + +static int __init init_x86_sysctl(void) +{ + register_sysctl_init("kernel", x86_sysctl_table); + return 0; +} +arch_initcall(init_x86_sysctl); + /* * Setup options */ @@ -429,6 +495,46 @@ static void __init parse_setup_data(void) } } +/* + * Translate the fields of 'struct boot_param' into global variables + * representing these parameters. + */ +static void __init parse_boot_params(void) +{ + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI32_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI64_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + } +#endif + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; +} + static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_indirect *indirect; @@ -527,6 +633,23 @@ void __init reserve_standard_io_resources(void) } +static void __init setup_kernel_resources(void) +{ + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); +} + static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -789,35 +912,7 @@ void __init setup_arch(char **cmdline_p) setup_olpc_ofw_pgd(); - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); - screen_info = boot_params.screen_info; - edid_info = boot_params.edid_info; -#ifdef CONFIG_X86_32 - apm_info.bios = boot_params.apm_bios_info; - ist_info = boot_params.ist_info; -#endif - saved_video_mode = boot_params.hdr.vid_mode; - bootloader_type = boot_params.hdr.type_of_loader; - if ((bootloader_type >> 4) == 0xe) { - bootloader_type &= 0xf; - bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; - } - bootloader_version = bootloader_type & 0xf; - bootloader_version |= boot_params.hdr.ext_loader_ver << 4; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; -#endif -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI32_LOADER_SIGNATURE, 4)) { - set_bit(EFI_BOOT, &efi.flags); - } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI64_LOADER_SIGNATURE, 4)) { - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); - } -#endif + parse_boot_params(); x86_init.oem.arch_setup(); @@ -841,19 +936,8 @@ void __init setup_arch(char **cmdline_p) copy_edd(); - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); - code_resource.start = __pa_symbol(_text); - code_resource.end = __pa_symbol(_etext)-1; - rodata_resource.start = __pa_symbol(__start_rodata); - rodata_resource.end = __pa_symbol(__end_rodata)-1; - data_resource.start = __pa_symbol(_sdata); - data_resource.end = __pa_symbol(_edata)-1; - bss_resource.start = __pa_symbol(__bss_start); - bss_resource.end = __pa_symbol(__bss_stop)-1; - /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug @@ -866,30 +950,6 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_memblock_x86_reserve_range(); -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - if (movable_node_is_enabled()) - memblock_set_bottom_up(true); -#endif - x86_report_nx(); apic_setup_apic_calls(); @@ -901,7 +961,6 @@ void __init setup_arch(char **cmdline_p) setup_clear_cpu_cap(X86_FEATURE_APIC); } - e820__reserve_setup_data(); e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) @@ -921,11 +980,11 @@ void __init setup_arch(char **cmdline_p) tsc_early_init(); x86_init.resources.probe_roms(); - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &rodata_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); + /* + * Add resources for kernel text and data to the iomem_resource. + * Do it after parse_early_param, so it can be debugged. + */ + setup_kernel_resources(); e820_add_kernel_range(); trim_bios_range(); @@ -990,7 +1049,6 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); /* diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b30d6e180df7c..bfa48e7a32a24 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -23,18 +23,13 @@ #include #include -#ifdef CONFIG_X86_64 -#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) -#else -#define BOOT_PERCPU_OFFSET 0 -#endif +DEFINE_PER_CPU_CACHE_HOT(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); -DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { - [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, -}; +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init; EXPORT_SYMBOL(__per_cpu_offset); /* @@ -169,7 +164,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(pcpu_hot.cpu_number, cpu) = cpu; + per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index ef654530bf5a9..98123ff10506c 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -33,25 +33,55 @@ #include #include +/* + * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0 + * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index + * GDT, selector values 0~3 all point to the NULL descriptor, thus values + * 0, 1, 2 and 3 are all valid NULL selector values. + * + * However IRET zeros ES, FS, GS, and DS segment registers if any of them + * is found to have any nonzero NULL selector value, which can be used by + * userspace in pre-FRED systems to spot any interrupt/exception by loading + * a nonzero NULL selector and waiting for it to become zero. Before FRED + * there was nothing software could do to prevent such an information leak. + * + * ERETU, the only legit instruction to return to userspace from kernel + * under FRED, by design does NOT zero any segment register to avoid this + * problem behavior. + * + * As such, leave NULL selector values 0~3 unchanged. + */ +static inline u16 fixup_rpl(u16 sel) +{ + return sel <= 3 ? sel : sel | 3; +} + #ifdef CONFIG_IA32_EMULATION #include static inline void reload_segments(struct sigcontext_32 *sc) { - unsigned int cur; + u16 cur; + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ savesegment(gs, cur); - if ((sc->gs | 0x03) != cur) - load_gs_index(sc->gs | 0x03); + if (fixup_rpl(sc->gs) != cur) + load_gs_index(fixup_rpl(sc->gs)); savesegment(fs, cur); - if ((sc->fs | 0x03) != cur) - loadsegment(fs, sc->fs | 0x03); + if (fixup_rpl(sc->fs) != cur) + loadsegment(fs, fixup_rpl(sc->fs)); + savesegment(ds, cur); - if ((sc->ds | 0x03) != cur) - loadsegment(ds, sc->ds | 0x03); + if (fixup_rpl(sc->ds) != cur) + loadsegment(ds, fixup_rpl(sc->ds)); savesegment(es, cur); - if ((sc->es | 0x03) != cur) - loadsegment(es, sc->es | 0x03); + if (fixup_rpl(sc->es) != cur) + loadsegment(es, fixup_rpl(sc->es)); } #define sigset32_t compat_sigset_t @@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, regs->orig_ax = -1; #ifdef CONFIG_IA32_EMULATION - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ reload_segments(&sc); #else - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; + loadsegment(gs, fixup_rpl(sc.gs)); + regs->fs = fixup_rpl(sc.fs); + regs->es = fixup_rpl(sc.es); + regs->ds = fixup_rpl(sc.ds); #endif return fpu__restore_sig(compat_ptr(sc.fpstate), 1); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c10850ae6f094..42ca131bebbc3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -190,7 +190,7 @@ static void ap_starting(void) apic_ap_setup(); /* Save the processor parameters. */ - smp_store_cpu_info(cpuid); + identify_secondary_cpu(cpuid); /* * The topology information must be up to date before @@ -215,7 +215,7 @@ static void ap_calibrate_delay(void) { /* * Calibrate the delay loop and update loops_per_jiffy in cpu_data. - * smp_store_cpu_info() stored a value that is close but not as + * identify_secondary_cpu() stored a value that is close but not as * accurate as the value just calculated. * * As this is invoked after the TSC synchronization check, @@ -229,7 +229,7 @@ static void ap_calibrate_delay(void) /* * Activate a secondary processor. */ -static void notrace start_secondary(void *unused) +static void notrace __noendbr start_secondary(void *unused) { /* * Don't put *anything* except direct CPU state initialization @@ -314,26 +314,7 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ -void smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = &cpu_data(id); - - /* Copy boot_cpu_data only on the first bringup */ - if (!c->initialized) - *c = boot_cpu_data; - c->cpu_index = id; - /* - * During boot time, CPU0 has this setup already. Save the info when - * bringing up an AP. - */ - identify_secondary_cpu(c); - c->initialized = true; -} +ANNOTATE_NOENDBR_SYM(start_secondary); static bool topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -841,7 +822,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(pcpu_hot.current_task, cpu) = idle; + per_cpu(current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ @@ -851,7 +832,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #endif return 0; } @@ -1262,43 +1243,9 @@ void play_dead_common(void) * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. */ -static inline void mwait_play_dead(void) +void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } /* Set up state for the kexec() hack below */ md->status = CPUDEAD_MWAIT_WAIT; @@ -1319,7 +1266,7 @@ static inline void mwait_play_dead(void) mb(); __monitor(md, 0, 0); mb(); - __mwait(eax, 0); + __mwait(eax_hint, 0); if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { /* @@ -1391,9 +1338,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c1bcb6053fc0..46b8f1f16676e 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -200,8 +200,7 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; for (i = 0; i < e820_table->nr_entries; i++) { - if ((e820_table->entries[i].type != E820_TYPE_RAM) - && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) + if (e820_table->entries[i].type != E820_TYPE_RAM) continue; add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2dbadf347b5f4..f4263cb3d21e6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -94,10 +94,20 @@ __always_inline int is_valid_bugaddr(unsigned long addr) /* * Check for UD1 or UD2, accounting for Address Size Override Prefixes. - * If it's a UD1, get the ModRM byte to pass along to UBSan. + * If it's a UD1, further decode to determine its use: + * + * FineIBT: ea (bad) + * FineIBT: f0 75 f9 lock jne . - 6 + * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax + * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax + * static_call: 0f b9 cc ud1 %esp,%ecx + * + * Notably UBSAN uses EAX, static_call uses ECX. */ -__always_inline int decode_bug(unsigned long addr, u32 *imm) +__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { + unsigned long start = addr; + bool lock = false; u8 v; if (addr < TASK_SIZE_MAX) @@ -106,28 +116,67 @@ __always_inline int decode_bug(unsigned long addr, u32 *imm) v = *(u8 *)(addr++); if (v == INSN_ASOP) v = *(u8 *)(addr++); - if (v != OPCODE_ESCAPE) + + if (v == INSN_LOCK) { + lock = true; + v = *(u8 *)(addr++); + } + + switch (v) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + addr += 1; /* d8 */ + *len = addr - start; + WARN_ON_ONCE(!lock); + return BUG_LOCK; + + case 0xea: + *len = addr - start; + return BUG_EA; + + case OPCODE_ESCAPE: + break; + + default: return BUG_NONE; + } v = *(u8 *)(addr++); - if (v == SECOND_BYTE_OPCODE_UD2) + if (v == SECOND_BYTE_OPCODE_UD2) { + *len = addr - start; return BUG_UD2; + } - if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) + if (v != SECOND_BYTE_OPCODE_UD1) return BUG_NONE; - /* Retrieve the immediate (type value) for the UBSAN UD1 */ - v = *(u8 *)(addr++); - if (X86_MODRM_RM(v) == 4) - addr++; - *imm = 0; - if (X86_MODRM_MOD(v) == 1) - *imm = *(u8 *)addr; - else if (X86_MODRM_MOD(v) == 2) - *imm = *(u32 *)addr; - else - WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); + v = *(u8 *)(addr++); /* ModRM */ + + if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) + addr++; /* SIB */ + + /* Decode immediate, if present */ + switch (X86_MODRM_MOD(v)) { + case 0: if (X86_MODRM_RM(v) == 5) + addr += 4; /* RIP + disp32 */ + break; + + case 1: *imm = *(s8 *)addr; + addr += 1; + break; + + case 2: *imm = *(s32 *)addr; + addr += 4; + break; + + case 3: break; + } + + /* record instruction length */ + *len = addr - start; + + if (X86_MODRM_REG(v) == 0) /* EAX */ + return BUG_UD1_UBSAN; return BUG_UD1; } @@ -257,11 +306,12 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { + unsigned long addr = regs->ip; bool handled = false; - int ud_type; - u32 imm; + int ud_type, ud_len; + s32 ud_imm; - ud_type = decode_bug(regs->ip, &imm); + ud_type = decode_bug(addr, &ud_imm, &ud_len); if (ud_type == BUG_NONE) return handled; @@ -281,15 +331,47 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (ud_type == BUG_UD2) { - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; + + switch (ud_type) { + case BUG_UD2: + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { handled = true; + break; } - } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { - pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip); + fallthrough; + + case BUG_EA: + case BUG_LOCK: + if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + break; + + case BUG_UD1_UBSAN: + if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", + report_ubsan_failure(regs, ud_imm), + (void *)regs->ip); + } + break; + + default: + break; + } + + /* + * When continuing, and regs->ip hasn't changed, move it to the next + * instruction. When not continuing execution, restore the instruction + * pointer. + */ + if (handled) { + if (regs->ip == addr) + regs->ip += ud_len; + } else { + regs->ip = addr; } + if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 34dec0b72ea8d..88e5a4ed9db3a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -959,7 +959,7 @@ static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -979,7 +979,7 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index deeb028256709..48e6cc1cb017a 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -152,7 +152,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5a952c5ea66bc..ff09211dada8d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -308,6 +309,11 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool return -ENOTSUPP; } +static int is_nop5_insn(uprobe_opcode_t *insn) +{ + return !memcmp(insn, x86_nops[5], 5); +} + #ifdef CONFIG_X86_64 asm ( @@ -338,7 +344,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -357,19 +363,23 @@ void *arch_uprobe_trampoline(unsigned long *psize) return &insn; } -static unsigned long trampoline_check_ip(void) +static unsigned long trampoline_check_ip(unsigned long tramp) { - unsigned long tramp = uprobe_get_trampoline_vaddr(); - return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry); } SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3]; + unsigned long err, ip, sp, r11_cx_ax[3], tramp; - if (regs->ip != trampoline_check_ip()) + /* If there's no trampoline, we are called from wrong place. */ + tramp = uprobe_get_trampoline_vaddr(); + if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR)) + goto sigill; + + /* Make sure the ip matches the only allowed sys_uretprobe caller. */ + if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); @@ -425,6 +435,140 @@ SYSCALL_DEFINE0(uretprobe) return -1; } +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + unsigned long ip, sp, ax_r11_cx_ip[4]; + int err; + + /* Allow execution only from uprobe trampolines. */ + if (!in_uprobe_trampoline(regs->ip)) + goto sigill; + + err = copy_from_user(ax_r11_cx_ip, (void __user *)regs->sp, sizeof(ax_r11_cx_ip)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = ax_r11_cx_ip[0]; + regs->r11 = ax_r11_cx_ip[1]; + regs->cx = ax_r11_cx_ip[2]; + regs->ip = ax_r11_cx_ip[3] - 5; + regs->sp += sizeof(ax_r11_cx_ip); + regs->orig_ax = -1; + + sp = regs->sp; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) + return regs->ax; + + regs->sp -= sizeof(ax_r11_cx_ip); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + ax_r11_cx_ip[0] = regs->ax; + ax_r11_cx_ip[1] = regs->r11; + ax_r11_cx_ip[2] = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (ax_r11_cx_ip[3] - 5 != regs->ip) + ax_r11_cx_ip[3] = regs->ip; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, ax_r11_cx_ip, sizeof(ax_r11_cx_ip)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses @@ -604,6 +748,391 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; + atomic64_t ref; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_page(unsigned long vaddr) +{ + struct vm_area_struct *vma, *prev = NULL; + unsigned long prev_vm_end = PAGE_SIZE; + VMA_ITERATOR(vmi, current->mm, 0); + + vma = vma_next(&vmi); + while (vma) { + if (prev) + prev_vm_end = prev->vm_end; + if (vma->vm_start - prev_vm_end >= PAGE_SIZE) { + if (is_reachable_by_call(prev_vm_end, vaddr)) + return prev_vm_end; + if (is_reachable_by_call(vma->vm_start - PAGE_SIZE, vaddr)) + return vma->vm_start - PAGE_SIZE; + } + prev = vma; + vma = vma_next(&vmi); + } + + return 0; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + const struct vm_special_mapping *mapping; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct uprobe_trampoline *tramp; + + mapping = user_64bit_mode(regs) ? &tramp_mapping : NULL; + if (!mapping) + return NULL; + + vaddr = find_nearest_page(vaddr); + if (!vaddr) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + atomic64_set(&tramp->ref, 1); + tramp->vaddr = vaddr; + + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + mapping); + if (IS_ERR(vma)) + goto free_area; + return tramp; + + free_area: + kfree(tramp); + return NULL; +} + +static struct uprobe_trampoline *uprobe_trampoline_get(unsigned long vaddr) +{ + struct uprobes_state *state = ¤t->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + atomic64_inc(&tramp->ref); + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + hlist_del(&tramp->node); + kfree(tramp); +} + +static void uprobe_trampoline_put(struct uprobe_trampoline *tramp) +{ + if (tramp == NULL) + return; + + if (atomic64_dec_and_test(&tramp->ref)) + destroy_uprobe_trampoline(tramp); +} + +enum { + OPT_PART, + OPT_INSN, + UNOPT_INT3, + UNOPT_PART, +}; + +struct write_opcode_ctx { + unsigned long base; + int update; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->update) { + case OPT_PART: + case OPT_INSN: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case UNOPT_INT3: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + case UNOPT_PART: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +static int write_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *ctx) +{ + return uprobe_write(auprobe, mm, vaddr, insn, nbytes, verify_insn, false, ctx); +} + +static void relative_call(void *dest, long from, long to) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *insn; + + insn = (struct __arch_relative_insn *)dest; + insn->raddr = (s32)(to - (from + 5)); + insn->op = CALL_INSN_OPCODE; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, + unsigned long tramp) +{ + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + char call[5]; + int err; + + relative_call(call, vaddr, tramp); + + /* + * We are in state where breakpoint (int3) is installed on top of first + * byte of the nop5 instruction. We will do following steps to overwrite + * this to call instruction: + * + * - sync cores + * - write last 4 bytes of the call instruction + * - sync cores + * - update the call instruction opcode + */ + + text_poke_sync(); + + ctx.update = OPT_PART; + err = write_insn(auprobe, mm, vaddr + 1, call + 1, 4, &ctx); + if (err) + return err; + + text_poke_sync(); + + ctx.update = OPT_INSN; + return write_insn(auprobe, mm, vaddr, call, 1, &ctx); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * We need to overwrite call instruction into nop5 instruction with + * breakpoint (int3) installed on top of its first byte. We will: + * + * - overwrite call opcode with breakpoint (int3) + * - sync cores + * - write last 4 bytes of the nop5 instruction + * - sync cores + */ + + ctx.update = UNOPT_INT3; + err = write_insn(auprobe, mm, vaddr, &int3, 1, &ctx); + if (err) + return err; + + text_poke_sync(); + + ctx.update = UNOPT_PART; + err = write_insn(auprobe, mm, vaddr + 1, (uprobe_opcode_t *) auprobe->insn + 1, 4, &ctx); + + text_poke_sync(); + return err; +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); + return 0; +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err || optimized) + return err; + } + return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN, false); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +{ + /* + * We might be in race where we have optimized uprobe, but the uprobe + * was flagged as failed from another task, so we try to unoptimize + * the uprobe regardless the failed flag. + */ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + WARN_ON_ONCE(swbp_unoptimize(auprobe, mm, vaddr)); + } + return uprobe_write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn, true); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} + +static bool emulate_nop5_insn(struct arch_uprobe *auprobe) +{ + return is_nop5_insn((uprobe_opcode_t *) &auprobe->insn); +} + +static int __arch_uprobe_optimize(struct mm_struct *mm, struct arch_uprobe *auprobe, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + int err = 0; + + tramp = uprobe_trampoline_get(vaddr); + if (!tramp) + return -1; + err = swbp_optimize(auprobe, mm, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err)) + uprobe_trampoline_put(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(mm, auprobe, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + if (!is_nop5_insn((uprobe_opcode_t *) &auprobe->insn)) + return false; + /* We can't do cross page atomic writes yet. */ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -617,6 +1146,14 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool emulate_nop5_insn(struct arch_uprobe *auprobe) +{ + return false; +} +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -848,6 +1385,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) break; case 0x0f: + if (emulate_nop5_insn(auprobe)) + goto setup; if (insn->opcode.nbytes != 2) return -ENOSYS; /* @@ -978,6 +1517,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 1258a5872d128..37ad437924523 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -29,8 +29,12 @@ */ #include +#include #include +#define SSE_MASK \ + (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31)))) + SYM_FUNC_START_LOCAL(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0deb4887d6e96..ccdc45e5b7596 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -43,7 +43,8 @@ ENTRY(phys_startup_64) #endif jiffies = jiffies_64; -const_pcpu_hot = pcpu_hot; +const_current_task = current_task; +const_cpu_current_top_of_stack = cpu_current_top_of_stack; #if defined(CONFIG_X86_64) /* @@ -112,12 +113,6 @@ ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ -#ifdef CONFIG_X86_64 -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(6); /* RW_ */ -#endif - init PT_LOAD FLAGS(7); /* RWE */ -#endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -193,6 +188,8 @@ SECTIONS PAGE_ALIGNED_DATA(PAGE_SIZE) + CACHE_HOT_DATA(L1_CACHE_BYTES) + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) DATA_DATA @@ -216,21 +213,7 @@ SECTIONS __init_begin = .; /* paired with __init_end */ } -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - .init.text - should - * start another segment - init. - */ - PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) - ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, - "per-CPU data too large - increase CONFIG_PHYSICAL_START") -#endif - INIT_TEXT_SECTION(PAGE_SIZE) -#ifdef CONFIG_X86_64 - :init -#endif /* * Section for code used exclusively before alternatives are run. All @@ -347,9 +330,8 @@ SECTIONS EXIT_DATA } -#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU_SECTION(INTERNODE_CACHE_BYTES) -#endif + PERCPU_SECTION(L1_CACHE_BYTES) + ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large") RUNTIME_CONST_VARIABLES RUNTIME_CONST(ptr, USER_PTR_MAX) @@ -493,19 +475,6 @@ SECTIONS PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); #ifdef CONFIG_X86_64 -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(fixed_percpu_data); -INIT_PER_CPU(irq_stack_backing_store); - -#ifdef CONFIG_SMP -. = ASSERT((fixed_percpu_data == 0), - "fixed_percpu_data is not at start of per-cpu area"); -#endif #ifdef CONFIG_MITIGATION_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8eb3a88707f21..121edf1f2a79a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { - entry->eax = entry->ebx; + entry->eax = entry->ebx = 0; break; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6ebeb6cea6c0d..24f0318c50d79 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -952,8 +952,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d7ab8780ab9e1..739aa6c0d0c36 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -690,8 +690,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->kvm = kvm; pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; + hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a009c94c26c2a..eb56cd9895748 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2921,9 +2921,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - apic->lapic_timer.timer.function = apic_timer_fn; + hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4ac4a1f8b81b..8160870398b90 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7460,7 +7460,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7471,13 +7471,14 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); - if (!nx_thread) - return; + if (IS_ERR(nx_thread)) + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7485,10 +7486,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0dbb25442ec14..661108d65ee72 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4590,6 +4590,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -4613,14 +4615,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. */ - if (sev_vcpu_has_debug_swap(svm)) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a713c803a3a37..e22ec946a79a7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -607,6 +607,9 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); + + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -684,6 +687,9 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + return 0; } @@ -1559,7 +1565,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) + if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && + static_branch_likely(&switch_vcpu_ibpb)) indirect_branch_prediction_barrier(); } if (kvm_vcpu_apicv_active(vcpu)) @@ -3165,6 +3172,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * AMD changed the architectural behavior of bits 5:2. On CPUs + * without BusLockTrap, bits 5:2 control "external pins", but + * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap + * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed + * the guest to set bits 5:2 despite not actually virtualizing + * Performance-Monitoring/Breakpoint external pins. Drop bits + * 5:2 for backwards compatibility. + */ + data &= ~GENMASK(5, 2); + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -4189,6 +4217,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4197,6 +4237,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4253,6 +4295,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4280,6 +4332,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9d7cdb8fbf872..ea44c1da5a7c9 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 2ed80aea3bb13..0c61153b275f6 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - 1: vmrun %rax - -2: cli - +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8a7af02d466e9..85b2d483ffa20 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5084,6 +5084,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } @@ -5316,9 +5327,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6c56d5235f0f3..3dd9007ae6856 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1477,7 +1477,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * performs IBPB on nested VM-Exit (a single nested transition * may switch the active VMCS multiple times). */ - if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + if (static_branch_likely(&switch_vcpu_ibpb) && + (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) indirect_branch_prediction_barrier(); } @@ -1514,16 +1515,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -7458,8 +7455,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8b111ce1087c7..951e44dc9d0ea 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -340,8 +340,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 633c87e2fd92e..96677576c836d 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -118,7 +118,7 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02159c967d29e..4b64ab350bcd4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10968,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(0, 7); } + vcpu->arch.host_debugctl = get_debugctlmsr(); + guest_timing_enter_irqoff(); for (;;) { @@ -12877,11 +12879,11 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } kvm_unload_vcpu_mmus(kvm); + kvm_destroy_vcpus(kvm); kvm_x86_call(vm_destroy)(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index a909b817b9c0d..1ac738dcf7a26 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2225,8 +2225,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; + hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 8a59c61624c2f..64ccecedc9f81 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -56,7 +56,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += string_32.o lib-y += memmove_32.o lib-y += cmpxchg8b_emu.o -ifneq ($(CONFIG_X86_CMPXCHG64),y) +ifneq ($(CONFIG_X86_CX8),y) lib-y += atomic64_386_32.o endif else @@ -66,5 +66,6 @@ endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_uncached_64.o - lib-y += cmpxchg16b_emu.o + lib-y += cmpxchg16b_emu.o + lib-y += bhi.o endif diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S new file mode 100644 index 0000000000000..58891681261b0 --- /dev/null +++ b/arch/x86/lib/bhi.S @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include + +/* + * Notably, the FineIBT preamble calling these will have ZF set and r10 zero. + * + * The very last element is in fact larger than 32 bytes, but since its the + * last element, this does not matter, + * + * There are 2 #UD sites, located between 0,1-2,3 and 4,5-6,7 such that they + * can be reached using Jcc.d8, these elements (1 and 5) have sufficiently + * big alignment holes for this to not stagger the array. + */ + +.pushsection .noinstr.text, "ax" + + .align 32 +SYM_CODE_START(__bhi_args) + +#ifdef CONFIG_FINEIBT_BHI + + .align 32 +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 8 + ANNOTATE_REACHABLE +.Lud_1: ud2 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + cmovne %r10, %rsi + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 8 + ANNOTATE_REACHABLE +.Lud_2: ud2 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + cmovne %r10, %r9 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + cmovne %r10, %r9 + cmovne %r10, %rsp + ANNOTATE_UNRET_SAFE + ret + int3 + +#endif /* CONFIG_FINEIBT_BHI */ + + .align 32 +SYM_INNER_LABEL(__bhi_args_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + nop /* Work around toolchain+objtool quirk */ +SYM_CODE_END(__bhi_args) + +.popsection diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 2760a15fbc002..a508e4a8c66a2 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ #include #include +#include +#include #include /* @@ -14,7 +16,7 @@ * Zero a page. * %rdi - page */ -SYM_FUNC_START(clear_page_rep) +SYM_TYPED_FUNC_START(clear_page_rep) movl $4096/8,%ecx xorl %eax,%eax rep stosq @@ -22,7 +24,7 @@ SYM_FUNC_START(clear_page_rep) SYM_FUNC_END(clear_page_rep) EXPORT_SYMBOL_GPL(clear_page_rep) -SYM_FUNC_START(clear_page_orig) +SYM_TYPED_FUNC_START(clear_page_orig) xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -44,7 +46,7 @@ SYM_FUNC_START(clear_page_orig) SYM_FUNC_END(clear_page_orig) EXPORT_SYMBOL_GPL(clear_page_orig) -SYM_FUNC_START(clear_page_erms) +SYM_TYPED_FUNC_START(clear_page_erms) movl $4096,%ecx xorl %eax,%eax rep stosb @@ -63,6 +65,7 @@ EXPORT_SYMBOL_GPL(clear_page_erms) * rcx: uncleared bytes or 0 if successful. */ SYM_FUNC_START(rep_stos_alternative) + ANNOTATE_NOENDBR cmpq $64,%rcx jae .Lunrolled diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 1c96be769adc3..d4bb24347ff81 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,7 @@ .text -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 /* * Emulate 'cmpxchg8b (%esi)' on UP diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index d6ae793d08faf..d8e87fedc20dc 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ * prefetch distance based on SMP/UP. */ ALIGN -SYM_FUNC_START(copy_page) +SYM_TYPED_FUNC_START(copy_page) ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fc9fb5d061744..aa8c341b24419 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -30,6 +32,7 @@ * it simpler for us, we can clobber rsi/rdi and rax freely. */ SYM_FUNC_START(rep_movs_alternative) + ANNOTATE_NOENDBR cmpq $64,%rcx jae .Llarge diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S index 2918e36eece22..18350b343c2ab 100644 --- a/arch/x86/lib/copy_user_uncached_64.S +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -5,6 +5,7 @@ #include #include +#include #include /* @@ -27,6 +28,7 @@ * rax uncopied bytes or 0 if successful. */ SYM_FUNC_START(__copy_user_nocache) + ANNOTATE_NOENDBR /* If destination is not 7-byte aligned, we'll have to align it */ testb $7,%dil jne .Lalign diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 23f81ca3f06b4..e86eda2c0b040 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -131,7 +131,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles) * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu * variable as the monitor target. */ - __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 89ecd57c9d423..9d5654b8a72a8 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -28,22 +28,20 @@ #include #include +#include #include #include #include #include #include #include +#include #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC .macro check_range size:req .if IS_ENABLED(CONFIG_X86_64) - movq $0x0123456789abcdef,%rdx - 1: - .pushsection runtime_ptr_USER_PTR_MAX,"a" - .long 1b - 8 - . - .popsection + RUNTIME_CONST_PTR USER_PTR_MAX, rdx cmp %rdx, %rax cmova %rdx, %rax .else @@ -62,6 +60,7 @@ .text SYM_FUNC_START(__get_user_1) + ANNOTATE_NOENDBR check_range size=1 ASM_STAC UACCESS movzbl (%_ASM_AX),%edx @@ -72,6 +71,7 @@ SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) + ANNOTATE_NOENDBR check_range size=2 ASM_STAC UACCESS movzwl (%_ASM_AX),%edx @@ -82,6 +82,7 @@ SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) + ANNOTATE_NOENDBR check_range size=4 ASM_STAC UACCESS movl (%_ASM_AX),%edx @@ -92,6 +93,7 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) + ANNOTATE_NOENDBR #ifndef CONFIG_X86_64 xor %ecx,%ecx #endif @@ -111,6 +113,7 @@ EXPORT_SYMBOL(__get_user_8) /* .. and the same for __get_user, just without the range checks */ SYM_FUNC_START(__get_user_nocheck_1) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC UACCESS movzbl (%_ASM_AX),%edx @@ -121,6 +124,7 @@ SYM_FUNC_END(__get_user_nocheck_1) EXPORT_SYMBOL(__get_user_nocheck_1) SYM_FUNC_START(__get_user_nocheck_2) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC UACCESS movzwl (%_ASM_AX),%edx @@ -131,6 +135,7 @@ SYM_FUNC_END(__get_user_nocheck_2) EXPORT_SYMBOL(__get_user_nocheck_2) SYM_FUNC_START(__get_user_nocheck_4) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC UACCESS movl (%_ASM_AX),%edx @@ -141,6 +146,7 @@ SYM_FUNC_END(__get_user_nocheck_4) EXPORT_SYMBOL(__get_user_nocheck_4) SYM_FUNC_START(__get_user_nocheck_8) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC #ifdef CONFIG_X86_64 diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 774bdf3e6f0a9..edbeb3ecad38c 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include @@ -9,6 +10,7 @@ * %rdi: w */ SYM_FUNC_START(__sw_hweight32) + ANNOTATE_NOENDBR #ifdef CONFIG_X86_64 movl %edi, %eax # w @@ -42,6 +44,7 @@ EXPORT_SYMBOL(__sw_hweight32) */ #ifdef CONFIG_X86_64 SYM_FUNC_START(__sw_hweight64) + ANNOTATE_NOENDBR pushq %rdi pushq %rdx diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 1b60ae81ecd88..aa1f92ee6b2e7 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,6 +8,7 @@ */ #include #include +#include #include #include @@ -26,7 +27,7 @@ * Output: * rax: dest */ -SYM_FUNC_START(__memmove) +SYM_TYPED_FUNC_START(__memmove) mov %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 0199d56cb479d..d66b710d628f8 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -28,7 +29,7 @@ * only for the return value that is the same as the source input, * which the compiler could/should do much better anyway. */ -SYM_FUNC_START(__memset) +SYM_TYPED_FUNC_START(__memset) ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS movq %rdi,%r9 diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index ebd259f314963..5ef8494896e82 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include #include @@ -12,7 +13,7 @@ * */ .macro op_safe_regs op -SYM_FUNC_START(\op\()_safe_regs) +SYM_TYPED_FUNC_START(\op\()_safe_regs) pushq %rbx pushq %r12 movq %rdi, %r10 /* Save pointer */ diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 4bf4fad5b148e..5a18ecc04a6c3 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -103,6 +103,7 @@ int msr_set_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, true); } +EXPORT_SYMBOL_GPL(msr_set_bit); /** * msr_clear_bit - Clear @bit in a MSR @msr. @@ -118,6 +119,7 @@ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } +EXPORT_SYMBOL_GPL(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS void do_trace_write_msr(unsigned int msr, u64 val, int failed) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 975c9c18263d2..46d9e9b98a618 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -13,6 +13,7 @@ */ #include #include +#include #include #include #include @@ -45,6 +46,7 @@ .text SYM_FUNC_START(__put_user_1) + ANNOTATE_NOENDBR check_range size=1 ASM_STAC 1: movb %al,(%_ASM_CX) @@ -55,6 +57,7 @@ SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) SYM_FUNC_START(__put_user_nocheck_1) + ANNOTATE_NOENDBR ASM_STAC 2: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -64,6 +67,7 @@ SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) + ANNOTATE_NOENDBR check_range size=2 ASM_STAC 3: movw %ax,(%_ASM_CX) @@ -74,6 +78,7 @@ SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) SYM_FUNC_START(__put_user_nocheck_2) + ANNOTATE_NOENDBR ASM_STAC 4: movw %ax,(%_ASM_CX) xor %ecx,%ecx @@ -83,6 +88,7 @@ SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) + ANNOTATE_NOENDBR check_range size=4 ASM_STAC 5: movl %eax,(%_ASM_CX) @@ -93,6 +99,7 @@ SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) SYM_FUNC_START(__put_user_nocheck_4) + ANNOTATE_NOENDBR ASM_STAC 6: movl %eax,(%_ASM_CX) xor %ecx,%ecx @@ -102,6 +109,7 @@ SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) + ANNOTATE_NOENDBR check_range size=8 ASM_STAC 7: mov %_ASM_AX,(%_ASM_CX) @@ -115,6 +123,7 @@ SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) SYM_FUNC_START(__put_user_nocheck_8) + ANNOTATE_NOENDBR ASM_STAC 9: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 391059b2c6fbc..a26c43abd47d8 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -326,6 +326,7 @@ SYM_FUNC_END(retbleed_untrain_ret) #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) SYM_FUNC_START(entry_untrain_ret) + ANNOTATE_NOENDBR ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) @@ -342,7 +343,7 @@ SYM_FUNC_START(call_depth_return_thunk) * case. */ CALL_THUNKS_DEBUG_INC_RETS - shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) + shlq $5, PER_CPU_VAR(__x86_call_depth) jz 1f ANNOTATE_UNRET_SAFE ret diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index e9251b89a9e93..654280aaa3e9e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -18,7 +18,7 @@ #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB - * @vaddr: virtual start address + * @addr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 5ab7bd2f1983c..bd5d101c5c379 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -101,9 +101,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, pmd_t *pmd; bool use_gbpage; - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; + next = pud_addr_end(addr, end); /* if this is already a gbpage, this portion is already mapped */ if (pud_leaf(*pud)) @@ -154,10 +152,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, p4d_t *p4d = p4d_page + p4d_index(addr); pud_t *pud; - next = (addr & P4D_MASK) + P4D_SIZE; - if (next > end) - next = end; - + next = p4d_addr_end(addr, end); if (p4d_present(*p4d)) { pud = pud_offset(p4d, 0); result = ident_pud_init(info, pud, addr, next); @@ -199,10 +194,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - + next = pgd_addr_end(addr, end); if (pgd_present(*pgd)) { p4d = p4d_offset(pgd, 0); result = ident_p4d_init(info, p4d, addr, next); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 62aa4d66a032d..bfa444a7dbb04 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -645,8 +645,13 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_phys_free(addr, PMD_SIZE); - real_end = addr + PMD_SIZE; + if (!addr) { + pr_warn("Failed to release memory for alloc_low_pages()"); + real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE)); + } else { + memblock_phys_free(addr, PMD_SIZE); + real_end = addr + PMD_SIZE; + } /* step_size need to be small so pgt_buf from BRK could cover it */ step_size = PMD_SIZE; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ac41b1e0940d4..f288aad8dc74e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -582,7 +582,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" + "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: @@ -606,18 +606,13 @@ static void __init highmem_pfn_init(void) #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); } -#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 01ea7c6df3036..519aa53114fac 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -468,8 +468,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, if (!after_bootmem && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RAM) && - !e820__mapped_any(paddr & PAGE_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_ACPI)) set_pte_init(pte, __pte(0), init); @@ -525,8 +523,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, if (!after_bootmem && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RAM) && - !e820__mapped_any(paddr & PMD_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_ACPI)) set_pmd_init(pmd, __pmd(0), init); @@ -614,8 +610,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, if (!after_bootmem && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RAM) && - !e820__mapped_any(paddr & PUD_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_ACPI)) set_pud_init(pud, __pud(0), init); @@ -703,8 +697,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, if (!after_bootmem && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RAM) && - !e820__mapped_any(paddr & P4D_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_ACPI)) set_p4d_init(p4d, __p4d(0), init); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 38ff7791a9c72..42c90b4207738 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -503,6 +503,14 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) +{ + if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (void __force *)ioremap_cache(phys_addr, size); + + return (void __force *)ioremap_encrypted(phys_addr, size); +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 11a93542d1983..3c306de52fd4d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -113,8 +113,14 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt physical memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + /* + * Adapt physical memory region size based on available memory, + * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the + * device BAR space assuming the direct map space is large enough + * for creating a ZONE_DEVICE mapping in the direct map corresponding + * to the physical BAR address. + */ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb)) kaslr_regions[0].size_tb = memory_tb; /* diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index e25288ee33c2d..f8a33b25ae869 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -72,6 +72,7 @@ SYM_FUNC_START(sme_encrypt_execute) SYM_FUNC_END(sme_encrypt_execute) SYM_FUNC_START(__enc_copy) + ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index e6c7686f443a0..9fce5b87b8c50 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -565,7 +565,7 @@ void __head sme_enable(struct boot_params *bp) } RIP_REL_REF(sme_me_mask) = me_mask; - physical_mask &= ~me_mask; - cc_vendor = CC_VENDOR_AMD; + RIP_REL_REF(physical_mask) &= ~me_mask; + RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD; cc_set_mask(me_mask); } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index b8a6ffffb4519..5ed2109211dab 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -84,7 +84,6 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -94,13 +93,7 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. */ - gap_min = SIZE_128M; - gap_max = (task_size / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SIZE_128M, (task_size / 6) * 5); return PAGE_ALIGN(task_size - gap - rnd); } diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index 3d2f7f0a6ed14..ad3c1feec990d 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ b/arch/x86/mm/pat/cpa-test.c @@ -183,7 +183,7 @@ static int pageattr_test(void) break; case 1: - err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1); break; case 2: diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index feb8cc6a12bf2..3a9e6dd58e2f0 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -984,27 +984,54 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, return -EINVAL; } -/* - * track_pfn_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). - * - * If the vma has a linear pfn mapping for the entire range, we get the prot - * from pte and reserve the entire vma range with single reserve_pfn_range call. - */ -int track_pfn_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { + const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start; resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; + int rc; - if (vma->vm_flags & VM_PAT) { - if (get_pat_info(vma, &paddr, &pgprot)) - return -EINVAL; - /* reserve the whole chunk covered by vma. */ - return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + if (!(src_vma->vm_flags & VM_PAT)) + return 0; + + /* + * Duplicate the PAT information for the dst VMA based on the src + * VMA. + */ + if (get_pat_info(src_vma, &paddr, &pgprot)) + return -EINVAL; + rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1); + if (!rc) + /* Reservation for the destination VMA succeeded. */ + vm_flags_set(dst_vma, VM_PAT); + return rc; +} + +void untrack_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + resource_size_t paddr; + unsigned long size; + + if (!(dst_vma->vm_flags & VM_PAT)) + return; + + /* + * As the page tables might not have been copied yet, the PAT + * information is obtained from the src VMA, just like during + * track_pfn_copy(). + */ + if (get_pat_info(src_vma, &paddr, NULL)) { + size = src_vma->vm_end - src_vma->vm_start; + free_pfn_range(paddr, size); } - return 0; + /* + * Reservation was freed, any copied page tables will get cleaned + * up later, but without getting PAT involved again. + */ + vm_flags_clear(dst_vma, VM_PAT); } /* @@ -1095,15 +1122,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, } } -/* - * untrack_pfn_clear is called if the following situation fits: - * - * 1) while mremapping a pfnmap for a new region, with the old vma after - * its pfnmap page table has been removed. The new vma has a new pfnmap - * to the same pfn & cache type with VM_PAT set. - * 2) while duplicating vm area, the new vma fails to copy the pgtable from - * old vma. - */ void untrack_pfn_clear(struct vm_area_struct *vma) { vm_flags_clear(vma, VM_PAT); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ef4514d64c052..72405d315b414 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -73,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +#define CPA_COLLAPSE 16 /* try to collapse large pages */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { @@ -105,6 +106,18 @@ static void split_page_count(int level) direct_pages_count[level - 1] += PTRS_PER_PTE; } +static void collapse_page_count(int level) +{ + direct_pages_count[level]++; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); + } + direct_pages_count[level - 1] -= PTRS_PER_PTE; +} + void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", @@ -122,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m) } #else static inline void split_page_count(int level) { } +static inline void collapse_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS @@ -211,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +#ifdef CONFIG_X86_64 + static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } -#ifdef CONFIG_X86_64 - /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): @@ -394,16 +408,49 @@ static void __cpa_flush_tlb(void *data) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } -static void cpa_flush(struct cpa_data *data, int cache) +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); + +static void cpa_collapse_large_pages(struct cpa_data *cpa) +{ + unsigned long start, addr, end; + struct ptdesc *ptdesc, *tmp; + LIST_HEAD(pgtables); + int collapsed = 0; + int i; + + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + for (i = 0; i < cpa->numpages; i++) + collapsed += collapse_large_pages(__cpa_addr(cpa, i), + &pgtables); + } else { + addr = __cpa_addr(cpa, 0); + start = addr & PMD_MASK; + end = addr + PAGE_SIZE * cpa->numpages; + + for (addr = start; within(addr, start, end); addr += PMD_SIZE) + collapsed += collapse_large_pages(addr, &pgtables); + } + + if (!collapsed) + return; + + flush_tlb_all(); + + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { + list_del(&ptdesc->pt_list); + __free_page(ptdesc_page(ptdesc)); + } +} + +static void cpa_flush(struct cpa_data *cpa, int cache) { - struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + goto collapse_large_pages; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) @@ -412,7 +459,7 @@ static void cpa_flush(struct cpa_data *data, int cache) on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) - return; + goto collapse_large_pages; mb(); for (i = 0; i < cpa->numpages; i++) { @@ -428,6 +475,10 @@ static void cpa_flush(struct cpa_data *data, int cache) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); + +collapse_large_pages: + if (cpa->flags & CPA_COLLAPSE) + cpa_collapse_large_pages(cpa); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, @@ -1197,6 +1248,161 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } +static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, + struct list_head *pgtables) +{ + pmd_t _pmd, old_pmd; + pte_t *pte, first; + unsigned long pfn; + pgprot_t pgprot; + int i = 0; + + addr &= PMD_MASK; + pte = pte_offset_kernel(pmd, addr); + first = *pte; + pfn = pte_pfn(first); + + /* Make sure alignment is suitable */ + if (PFN_PHYS(pfn) & ~PMD_MASK) + return 0; + + /* The page is 4k intentionally */ + if (pte_flags(first) & _PAGE_KERNEL_4K) + return 0; + + /* Check that the rest of PTEs are compatible with the first one */ + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { + pte_t entry = *pte; + + if (!pte_present(entry)) + return 0; + if (pte_flags(entry) != pte_flags(first)) + return 0; + if (pte_pfn(entry) != pte_pfn(first) + i) + return 0; + } + + old_pmd = *pmd; + + /* Success: set up a large page */ + pgprot = pgprot_4k_2_large(pte_pgprot(first)); + pgprot_val(pgprot) |= _PAGE_PSE; + _pmd = pfn_pmd(pfn, pgprot); + set_pmd(pmd, _pmd); + + /* Queue the page table to be freed after TLB flush */ + list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); + + if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { + struct page *page; + + /* Update all PGD tables to use the same large page */ + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + /* Something is wrong if entries doesn't match */ + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) + continue; + set_pmd(pmd, _pmd); + } + } + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_2M); + + return 1; +} + +static int collapse_pud_page(pud_t *pud, unsigned long addr, + struct list_head *pgtables) +{ + unsigned long pfn; + pmd_t *pmd, first; + int i; + + if (!direct_gbpages) + return 0; + + addr &= PUD_MASK; + pmd = pmd_offset(pud, addr); + first = *pmd; + + /* + * To restore PUD page all PMD entries must be large and + * have suitable alignment + */ + pfn = pmd_pfn(first); + if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) + return 0; + + /* + * To restore PUD page, all following PMDs must be compatible with the + * first one. + */ + for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { + pmd_t entry = *pmd; + + if (!pmd_present(entry) || !pmd_leaf(entry)) + return 0; + if (pmd_flags(entry) != pmd_flags(first)) + return 0; + if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) + return 0; + } + + /* Restore PUD page and queue page table to be freed after TLB flush */ + list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); + set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_1G); + + return 1; +} + +/* + * Collapse PMD and PUD pages in the kernel mapping around the address where + * possible. + * + * Caller must flush TLB and free page tables queued on the list before + * touching the new entries. CPU must not see TLB entries of different size + * with different attributes. + */ +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) +{ + int collapsed = 0; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + addr &= PMD_MASK; + + spin_lock(&pgd_lock); + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + goto out; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + goto out; + pud = pud_offset(p4d, addr); + if (!pud_present(*pud) || pud_leaf(*pud)) + goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) + goto out; + + collapsed = collapse_pmd_page(pmd, addr, pgtables); + if (collapsed) + collapsed += collapse_pud_page(pud, addr, pgtables); + +out: + spin_unlock(&pgd_lock); + return collapsed; +} + static bool try_to_free_pte_page(pte_t *pte) { int i; @@ -1942,19 +2148,6 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, CPA_PAGES_ARRAY, pages); } -/* - * __set_memory_prot is an internal helper for callers that have been passed - * a pgprot_t value from upper layers and a reservation has already been taken. - * If you want to set the pgprot to a specific page protocol, use the - * set_memory_xx() functions. - */ -int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) -{ - return change_page_attr_set_clr(&addr, numpages, prot, - __pgprot(~pgprot_val(prot)), 0, 0, - NULL); -} - int _set_memory_uc(unsigned long addr, int numpages) { /* @@ -2120,7 +2313,8 @@ int set_memory_rox(unsigned long addr, int numpages) if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; - return change_page_attr_clear(&addr, numpages, clr, 0); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, + CPA_COLLAPSE, NULL); } int set_memory_rw(unsigned long addr, int numpages) @@ -2147,7 +2341,8 @@ int set_memory_p(unsigned long addr, int numpages) int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + return change_page_attr_set_clr(&addr, numpages, + __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL); } @@ -2420,7 +2615,7 @@ static int __set_pages_np(struct page *page, int numpages) .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* @@ -2507,7 +2702,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; @@ -2550,7 +2745,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1fef5ad32d5a8..cec321fb74f21 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -12,59 +12,15 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif -#ifdef CONFIG_HIGHPTE -#define PGTABLE_HIGHMEM __GFP_HIGHMEM -#else -#define PGTABLE_HIGHMEM 0 -#endif - -#ifndef CONFIG_PARAVIRT -#ifndef CONFIG_PT_RECLAIM -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif /* !CONFIG_PT_RECLAIM */ -#endif /* !CONFIG_PARAVIRT */ - -gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; - pgtable_t pte_alloc_one(struct mm_struct *mm) { - return __pte_alloc_one(mm, __userpte_alloc_gfp); -} - -static int __init setup_userpte(char *arg) -{ - if (!arg) - return -EINVAL; - - /* - * "userpte=nohigh" disables allocation of user pagetables in - * high memory. - */ - if (strcmp(arg, "nohigh") == 0) - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - else - return -EINVAL; - return 0; + return __pte_alloc_one(mm, GFP_PGTABLE_USER); } -early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); - paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); + tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -78,21 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); + tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); + tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); + tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6cf881a942bbe..3e2c6e2f5134c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -74,13 +74,15 @@ * use different names for each of them: * * ASID - [0, TLB_NR_DYN_ASIDS-1] - * the canonical identifier for an mm + * the canonical identifier for an mm, dynamically allocated on each CPU + * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] + * the canonical, global identifier for an mm, identical across all CPUs * - * kPCID - [1, TLB_NR_DYN_ASIDS] + * kPCID - [1, MAX_ASID_AVAILABLE] * the value we write into the PCID part of CR3; corresponds to the * ASID+1, because PCID 0 is special. * - * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] * for KPTI each mm has two address spaces and thus needs two * PCID values, but we can still do with a single ASID denomination * for each mm. Corresponds to kPCID + 2048. @@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, return; } + /* + * TLB consistency for global ASIDs is maintained with hardware assisted + * remote TLB flushing. Global ASIDs are always up to date. + */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + u16 global_asid = mm_global_asid(next); + + if (global_asid) { + *new_asid = global_asid; + *need_flush = false; + return; + } + } + if (this_cpu_read(cpu_tlbstate.invalidate_other)) clear_asid_other(); @@ -251,6 +267,268 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +/* + * Global ASIDs are allocated for multi-threaded processes that are + * active on multiple CPUs simultaneously, giving each of those + * processes the same PCID on every CPU, for use with hardware-assisted + * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR. + * + * These global ASIDs are held for the lifetime of the process. + */ +static DEFINE_RAW_SPINLOCK(global_asid_lock); +static u16 last_global_asid = MAX_ASID_AVAILABLE; +static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE); +static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE); +static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; + +/* + * When the search for a free ASID in the global ASID space reaches + * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously + * freed global ASIDs are safe to re-use. + * + * This way the global flush only needs to happen at ASID rollover + * time, and not at ASID allocation time. + */ +static void reset_global_asid_space(void) +{ + lockdep_assert_held(&global_asid_lock); + + invlpgb_flush_all_nonglobals(); + + /* + * The TLB flush above makes it safe to re-use the previously + * freed global ASIDs. + */ + bitmap_andnot(global_asid_used, global_asid_used, + global_asid_freed, MAX_ASID_AVAILABLE); + bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); + + /* Restart the search from the start of global ASID space. */ + last_global_asid = TLB_NR_DYN_ASIDS; +} + +static u16 allocate_global_asid(void) +{ + u16 asid; + + lockdep_assert_held(&global_asid_lock); + + /* The previous allocation hit the edge of available address space */ + if (last_global_asid >= MAX_ASID_AVAILABLE - 1) + reset_global_asid_space(); + + asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); + + if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) { + /* This should never happen. */ + VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", + global_asid_available); + return 0; + } + + /* Claim this global ASID. */ + __set_bit(asid, global_asid_used); + last_global_asid = asid; + global_asid_available--; + return asid; +} + +/* + * Check whether a process is currently active on more than @threshold CPUs. + * This is a cheap estimation on whether or not it may make sense to assign + * a global ASID to this process, and use broadcast TLB invalidation. + */ +static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) +{ + int count = 0; + int cpu; + + /* This quick check should eliminate most single threaded programs. */ + if (cpumask_weight(mm_cpumask(mm)) <= threshold) + return false; + + /* Slower check to make sure. */ + for_each_cpu(cpu, mm_cpumask(mm)) { + /* Skip the CPUs that aren't really running this process. */ + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) + continue; + + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + continue; + + if (++count > threshold) + return true; + } + return false; +} + +/* + * Assign a global ASID to the current process, protecting against + * races between multiple threads in the process. + */ +static void use_global_asid(struct mm_struct *mm) +{ + u16 asid; + + guard(raw_spinlock_irqsave)(&global_asid_lock); + + /* This process is already using broadcast TLB invalidation. */ + if (mm_global_asid(mm)) + return; + + /* + * The last global ASID was consumed while waiting for the lock. + * + * If this fires, a more aggressive ASID reuse scheme might be + * needed. + */ + if (!global_asid_available) { + VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); + return; + } + + asid = allocate_global_asid(); + if (!asid) + return; + + mm_assign_global_asid(mm, asid); +} + +void mm_free_global_asid(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return; + + if (!mm_global_asid(mm)) + return; + + guard(raw_spinlock_irqsave)(&global_asid_lock); + + /* The global ASID can be re-used only after flush at wrap-around. */ +#ifdef CONFIG_BROADCAST_TLB_FLUSH + __set_bit(mm->context.global_asid, global_asid_freed); + + mm->context.global_asid = 0; + global_asid_available++; +#endif +} + +/* + * Is the mm transitioning from a CPU-local ASID to a global ASID? + */ +static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid) +{ + u16 global_asid = mm_global_asid(mm); + + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return false; + + /* Process is transitioning to a global ASID */ + if (global_asid && asid != global_asid) + return true; + + return false; +} + +/* + * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86 + * systems have over 8k CPUs. Because of this potential ASID shortage, + * global ASIDs are handed out to processes that have frequent TLB + * flushes and are active on 4 or more CPUs simultaneously. + */ +static void consider_global_asid(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return; + + /* Check every once in a while. */ + if ((current->pid & 0x1f) != (jiffies & 0x1f)) + return; + + /* + * Assign a global ASID if the process is active on + * 4 or more CPUs simultaneously. + */ + if (mm_active_cpus_exceeds(mm, 3)) + use_global_asid(mm); +} + +static void finish_asid_transition(struct flush_tlb_info *info) +{ + struct mm_struct *mm = info->mm; + int bc_asid = mm_global_asid(mm); + int cpu; + + if (!mm_in_asid_transition(mm)) + return; + + for_each_cpu(cpu, mm_cpumask(mm)) { + /* + * The remote CPU is context switching. Wait for that to + * finish, to catch the unlikely case of it switching to + * the target mm with an out of date ASID. + */ + while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) + cpu_relax(); + + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) + continue; + + /* + * If at least one CPU is not using the global ASID yet, + * send a TLB flush IPI. The IPI should cause stragglers + * to transition soon. + * + * This can race with the CPU switching to another task; + * that results in a (harmless) extra IPI. + */ + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { + flush_tlb_multi(mm_cpumask(info->mm), info); + return; + } + } + + /* All the CPUs running this process are using the global ASID. */ + mm_clear_asid_transition(mm); +} + +static void broadcast_tlb_flush(struct flush_tlb_info *info) +{ + bool pmd = info->stride_shift == PMD_SHIFT; + unsigned long asid = mm_global_asid(info->mm); + unsigned long addr = info->start; + + /* + * TLB flushes with INVLPGB are kicked off asynchronously. + * The inc_mm_tlb_gen() guarantees page table updates are done + * before these TLB flushes happen. + */ + if (info->end == TLB_FLUSH_ALL) { + invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); + /* Do any CPUs supporting INVLPGB need PTI? */ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + invlpgb_flush_single_pcid_nosync(user_pcid(asid)); + } else do { + unsigned long nr = 1; + + if (info->stride_shift <= PMD_SHIFT) { + nr = (info->end - addr) >> info->stride_shift; + nr = clamp_val(nr, 1, invlpgb_count_max); + } + + invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); + if (cpu_feature_enabled(X86_FEATURE_PTI)) + invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); + + addr += nr << info->stride_shift; + } while (addr < info->end); + + finish_asid_transition(info); + + /* Wait for the INVLPGBs kicked off above to finish. */ + __tlbsync(); +} + /* * Given an ASID, flush the corresponding user ASID. We can delay this * until the next time we switch to it. @@ -447,8 +725,7 @@ static void cond_mitigation(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != - (unsigned long)next->mm) + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm) indirect_branch_prediction_barrier(); } @@ -556,7 +833,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, */ if (prev == next) { /* Not actually switching mm's */ - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + VM_WARN_ON(is_dyn_asid(prev_asid) && + this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* @@ -573,6 +851,20 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); + /* Check if the current mm is transitioning to a global ASID */ + if (mm_needs_global_asid(next, prev_asid)) { + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + goto reload_tlb; + } + + /* + * Broadcast TLB invalidation keeps this ASID up to date + * all the time. + */ + if (is_global_asid(prev_asid)) + return; + /* * If the CPU is not in lazy TLB mode, we are just switching * from one thread in a process to another thread in the same @@ -606,6 +898,13 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, */ cond_mitigation(tsk); + /* + * Let nmi_uaccess_okay() and finish_asid_transition() + * know that CR3 is changing. + */ + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); + barrier(); + /* * Leave this CPU in prev's mm_cpumask. Atomic writes to * mm_cpumask can be expensive under contention. The CPU @@ -620,14 +919,12 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - - /* Let nmi_uaccess_okay() know that we're changing CR3. */ - this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); - barrier(); } +reload_tlb: new_lam = mm_lam_cr3_mask(next); if (need_flush) { + VM_WARN_ON_ONCE(is_global_asid(new_asid)); this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, new_lam, true); @@ -746,7 +1043,7 @@ static void flush_tlb_func(void *info) const struct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + u64 local_tlb_gen; bool local = smp_processor_id() == f->initiating_cpu; unsigned long nr_invalidate = 0; u64 mm_tlb_gen; @@ -769,6 +1066,16 @@ static void flush_tlb_func(void *info) if (unlikely(loaded_mm == &init_mm)) return; + /* Reload the ASID if transitioning into or out of a global ASID */ + if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) { + switch_mm_irqs_off(NULL, loaded_mm, NULL); + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + } + + /* Broadcast ASIDs are always kept up to date with INVLPGB. */ + if (is_global_asid(loaded_mm_asid)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); @@ -786,6 +1093,8 @@ static void flush_tlb_func(void *info) return; } + local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && f->new_tlb_gen <= local_tlb_gen)) { /* @@ -953,7 +1262,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, * up on the new contents of what used to be page tables, while * doing a speculative memory access. */ - if (info->freed_tables) + if (info->freed_tables || mm_in_asid_transition(info->mm)) on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, @@ -1000,6 +1309,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); #endif + /* + * If the number of flushes is so large that a full flush + * would be faster, do a full flush. + */ + if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; + } + info->start = start; info->end = end; info->mm = mm; @@ -1026,17 +1344,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, bool freed_tables) { struct flush_tlb_info *info; + int cpu = get_cpu(); u64 new_tlb_gen; - int cpu; - - cpu = get_cpu(); - - /* Should we flush just the requested range? */ - if ((end == TLB_FLUSH_ALL) || - ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { - start = 0; - end = TLB_FLUSH_ALL; - } /* This is also a barrier that synchronizes with switch_mm(). */ new_tlb_gen = inc_mm_tlb_gen(mm); @@ -1049,9 +1358,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case. */ - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + if (mm_global_asid(mm)) { + broadcast_tlb_flush(info); + } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { info->trim_cpumask = should_trim_cpumask(mm); flush_tlb_multi(mm_cpumask(mm), info); + consider_global_asid(mm); } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { lockdep_assert_irqs_enabled(); local_irq_disable(); @@ -1064,7 +1376,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } - static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); @@ -1074,7 +1385,32 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - on_each_cpu(do_flush_tlb_all, NULL, 1); + + /* First try (faster) hardware-assisted TLB invalidation. */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_flush_all(); + else + /* Fall back to the IPI-based invalidation. */ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +/* Flush an arbitrarily large range of memory with INVLPGB. */ +static void invlpgb_kernel_range_flush(struct flush_tlb_info *info) +{ + unsigned long addr, nr; + + for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { + nr = (info->end - addr) >> PAGE_SHIFT; + + /* + * INVLPGB has a limit on the size of ranges it can + * flush. Break up large flushes. + */ + nr = clamp_val(nr, 1, invlpgb_count_max); + + invlpgb_flush_addr_nosync(addr, nr); + } + __tlbsync(); } static void do_kernel_range_flush(void *info) @@ -1087,24 +1423,37 @@ static void do_kernel_range_flush(void *info) flush_tlb_one_kernel(addr); } -void flush_tlb_kernel_range(unsigned long start, unsigned long end) +static void kernel_tlb_flush_all(struct flush_tlb_info *info) { - /* Balance as user space task's flush, a bit conservative */ - if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_flush_all(); + else on_each_cpu(do_flush_tlb_all, NULL, 1); - } else { - struct flush_tlb_info *info; - - preempt_disable(); - info = get_flush_tlb_info(NULL, start, end, 0, false, - TLB_GENERATION_INVALID); +} +static void kernel_tlb_flush_range(struct flush_tlb_info *info) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_kernel_range_flush(info); + else on_each_cpu(do_kernel_range_flush, info, 1); +} - put_flush_tlb_info(); - preempt_enable(); - } +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + struct flush_tlb_info *info; + + guard(preempt)(); + + info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, + TLB_GENERATION_INVALID); + + if (info->end == TLB_FLUSH_ALL) + kernel_tlb_flush_all(info); + else + kernel_tlb_flush_range(info); + + put_flush_tlb_info(); } /* @@ -1283,7 +1632,9 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case. */ - if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + invlpgb_flush_all_nonglobals(); + } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { flush_tlb_multi(&batch->cpumask, info); } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { lockdep_assert_irqs_enabled(); @@ -1325,7 +1676,7 @@ bool nmi_uaccess_okay(void) if (loaded_mm != current_mm) return false; - VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa()); return true; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d3491cc0898bf..9e5fe2ba858f0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -410,16 +410,20 @@ static void emit_nops(u8 **pprog, int len) * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT * in arch/x86/kernel/alternative.c */ +static int emit_call(u8 **prog, void *func, void *ip); -static void emit_fineibt(u8 **pprog, u32 hash) +static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; EMIT_ENDBR(); EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */ - EMIT2(0x74, 0x07); /* jz.d8 +7 */ - EMIT2(0x0f, 0x0b); /* ud2 */ - EMIT1(0x90); /* nop */ + if (cfi_bhi) { + emit_call(&prog, __bhi_args[arity], ip + 11); + } else { + EMIT2(0x75, 0xf9); /* jne.d8 .-7 */ + EMIT3(0x0f, 0x1f, 0x00); /* nop3 */ + } EMIT_ENDBR_POISON(); *pprog = prog; @@ -448,13 +452,13 @@ static void emit_kcfi(u8 **pprog, u32 hash) *pprog = prog; } -static void emit_cfi(u8 **pprog, u32 hash) +static void emit_cfi(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; switch (cfi_mode) { case CFI_FINEIBT: - emit_fineibt(&prog, hash); + emit_fineibt(&prog, ip, hash, arity); break; case CFI_KCFI: @@ -505,13 +509,17 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog) * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, +static void emit_prologue(u8 **pprog, u8 *ip, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog, bool is_exception_cb) { u8 *prog = *pprog; - emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash); + if (is_subprog) { + emit_cfi(&prog, ip, cfi_bpf_subprog_hash, 5); + } else { + emit_cfi(&prog, ip, cfi_bpf_hash, 1); + } /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ @@ -641,7 +649,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * See emit_prologue(), for IBT builds the trampoline hook is preceded * with an ENDBR instruction. */ - if (is_endbr(*(u32 *)ip)) + if (is_endbr(ip)) ip += ENDBR_INSN_SIZE; return __bpf_arch_text_poke(ip, t, old_addr, new_addr); @@ -1524,7 +1532,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image detect_reg_usage(insn, insn_cnt, callee_regs_used); - emit_prologue(&prog, stack_depth, + emit_prologue(&prog, image, stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); /* Exception callback will clobber callee regs for its own use, and @@ -3105,7 +3113,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* skip patched call instruction and point orig_call to actual * body of the kernel function. */ - if (is_endbr(*(u32 *)orig_call)) + if (is_endbr(orig_call)) orig_call += ENDBR_INSN_SIZE; orig_call += X86_PATCH_SIZE; } @@ -3116,7 +3124,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* * Indirect call for bpf_struct_ops */ - emit_cfi(&prog, cfi_get_func_hash(func_addr)); + emit_cfi(&prog, image, + cfi_get_func_hash(func_addr), + cfi_get_func_arity(func_addr)); } else { /* * Direct-call fentry stub, as such it needs accounting for the diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 48bcada5cabe5..4933fb3379832 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -12,8 +12,6 @@ obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o -obj-$(CONFIG_STA2X11) += sta2x11-fixup.o - obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c deleted file mode 100644 index 8c8ddc4dcc08d..0000000000000 --- a/arch/x86/pci/sta2x11-fixup.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DMA translation between STA2x11 AMBA memory mapping and the x86 memory mapping - * - * ST Microelectronics ConneXt (STA2X11/STA2X10) - * - * Copyright (c) 2010-2011 Wind River Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define STA2X11_SWIOTLB_SIZE (4*1024*1024) - -/* - * We build a list of bus numbers that are under the ConneXt. The - * main bridge hosts 4 busses, which are the 4 endpoints, in order. - */ -#define STA2X11_NR_EP 4 /* 0..3 included */ -#define STA2X11_NR_FUNCS 8 /* 0..7 included */ -#define STA2X11_AMBA_SIZE (512 << 20) - -struct sta2x11_ahb_regs { /* saved during suspend */ - u32 base, pexlbase, pexhbase, crw; -}; - -struct sta2x11_mapping { - int is_suspended; - struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS]; -}; - -struct sta2x11_instance { - struct list_head list; - int bus0; - struct sta2x11_mapping map[STA2X11_NR_EP]; -}; - -static LIST_HEAD(sta2x11_instance_list); - -/* At probe time, record new instances of this bridge (likely one only) */ -static void sta2x11_new_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = kzalloc(sizeof(*instance), GFP_ATOMIC); - if (!instance) - return; - /* This has a subordinate bridge, with 4 more-subordinate ones */ - instance->bus0 = pdev->subordinate->number + 1; - - if (list_empty(&sta2x11_instance_list)) { - int size = STA2X11_SWIOTLB_SIZE; - /* First instance: register your own swiotlb area */ - dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size, GFP_DMA, NULL)) - dev_emerg(&pdev->dev, "init swiotlb failed\n"); - } - list_add(&instance->list, &sta2x11_instance_list); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance); - -/* - * Utility functions used in this file from below - */ -static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - list_for_each_entry(instance, &sta2x11_instance_list, list) { - ep = pdev->bus->number - instance->bus0; - if (ep >= 0 && ep < STA2X11_NR_EP) - return instance; - } - return NULL; -} - -static int sta2x11_pdev_to_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return -1; - - return pdev->bus->number - instance->bus0; -} - -/* This is exported, as some devices need to access the MFD registers */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev) -{ - return sta2x11_pdev_to_instance(pdev); -} -EXPORT_SYMBOL(sta2x11_get_instance); - -/* At setup time, we use our own ops if the device is a ConneXt one */ -static void sta2x11_setup_pdev(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - - if (!instance) /* either a sta2x11 bridge or another ST device */ - return; - - /* We must enable all devices as master, for audio DMA to work */ - pci_set_master(pdev); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev); - -/* - * At boot we must set up the mappings for the pcie-to-amba bridge. - * It involves device access, and the same happens at suspend/resume time - */ - -#define AHB_MAPB 0xCA4 -#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10) -#define AHB_CRW_SZMASK 0xfffffc00UL -#define AHB_CRW_ENABLE (1 << 0) -#define AHB_CRW_WTYPE_MEM (2 << 1) -#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */ -#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */ -#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10) -#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10) -#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10) - -/* At probe time, enable mapping for each endpoint, using the pdev */ -static void sta2x11_map_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - struct device *dev = &pdev->dev; - u32 amba_base, max_amba_addr; - int i, ret; - - if (!instance) - return; - - pci_read_config_dword(pdev, AHB_BASE(0), &amba_base); - max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1; - - ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE); - if (ret) - dev_err(dev, "sta2x11: could not set DMA offset\n"); - - dev->bus_dma_limit = max_amba_addr; - dma_set_mask_and_coherent(&pdev->dev, max_amba_addr); - - /* Configure AHB mapping */ - pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0); - pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0); - pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE | - AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE); - - /* Disable all the other windows */ - for (i = 1; i < STA2X11_NR_FUNCS; i++) - pci_write_config_dword(pdev, AHB_CRW(i), 0); - - dev_info(&pdev->dev, - "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n", - sta2x11_pdev_to_ep(pdev), amba_base, max_amba_addr); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep); - -#ifdef CONFIG_PM /* Some register values must be saved and restored */ - -static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return NULL; - ep = sta2x11_pdev_to_ep(pdev); - return instance->map + ep; -} - -static void suspend_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - if (map->is_suspended) - return; - map->is_suspended = 1; - - /* Save all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_read_config_dword(pdev, AHB_BASE(i), ®s->base); - pci_read_config_dword(pdev, AHB_PEXLBASE(i), ®s->pexlbase); - pci_read_config_dword(pdev, AHB_PEXHBASE(i), ®s->pexhbase); - pci_read_config_dword(pdev, AHB_CRW(i), ®s->crw); - } -} -DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping); - -static void resume_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - - if (!map->is_suspended) - goto out; - map->is_suspended = 0; - - /* Restore all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_write_config_dword(pdev, AHB_BASE(i), regs->base); - pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase); - pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase); - pci_write_config_dword(pdev, AHB_CRW(i), regs->crw); - } -out: - pci_set_master(pdev); /* Like at boot, enable master on all devices */ -} -DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping); - -#endif /* CONFIG_PM */ diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index ccb23c73cbe8b..63066e7c85177 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index cf5dca2dbb91d..e108ce7dad6a1 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -215,13 +215,12 @@ static u32 __init olpc_dt_get_board_revision(void) static int __init olpc_dt_compatible_match(phandle node, const char *compat) { char buf[64], *p; - int plen, len; + int plen; plen = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf)); if (plen <= 0) return 0; - len = strlen(compat); for (p = buf; p < buf + plen; p += strlen(p) + 1) { if (strcmp(p, compat) == 0) return 1; diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 4733a5f467b81..cfa18ec7d55f1 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -173,10 +173,14 @@ SYM_CODE_START(pvh_start_xen) 1: UNWIND_HINT_END_OF_STACK - /* Set base address in stack canary descriptor. */ - mov $MSR_GS_BASE,%ecx - leal canary(%rip), %eax - xor %edx, %edx + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. + */ + movl $MSR_GS_BASE,%ecx + xorl %eax, %eax + xorl %edx, %edx wrmsr /* Call xen_prepare_pvh() via the kernel virtual mapping */ @@ -238,8 +242,6 @@ SYM_DATA_START_LOCAL(gdt_start) SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 -SYM_DATA_LOCAL(canary, .fill 48, 1, 0) - SYM_DATA_START_LOCAL(early_stack) .fill BOOT_STACK_SIZE, 1, 0 SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 0a0539e1cc814..8c534c36adfa2 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -26,6 +26,7 @@ /* code below belongs to the image kernel */ .align PAGE_SIZE SYM_FUNC_START(restore_registers) + ANNOTATE_NOENDBR /* go back to the original page tables */ movq %r9, %cr3 @@ -119,6 +120,7 @@ SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ SYM_FUNC_START(core_restore_code) + ANNOTATE_NOENDBR /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ diff --git a/arch/x86/tools/cpufeaturemasks.awk b/arch/x86/tools/cpufeaturemasks.awk new file mode 100755 index 0000000000000..173d5bf2d999f --- /dev/null +++ b/arch/x86/tools/cpufeaturemasks.awk @@ -0,0 +1,88 @@ +#!/usr/bin/awk +# +# Convert cpufeatures.h to a list of compile-time masks +# Note: this blithely assumes that each word has at least one +# feature defined in it; if not, something else is wrong! +# + +BEGIN { + printf "#ifndef _ASM_X86_CPUFEATUREMASKS_H\n"; + printf "#define _ASM_X86_CPUFEATUREMASKS_H\n\n"; + + file = 0 +} + +FNR == 1 { + ++file; + + # arch/x86/include/asm/cpufeatures.h + if (file == 1) + FS = "[ \t()*+]+"; + + # .config + if (file == 2) + FS = "="; +} + +# Create a dictionary of sorts, containing all defined feature bits +file == 1 && $1 ~ /^#define$/ && $2 ~ /^X86_FEATURE_/ { + nfeat = $3 * $4 + $5; + feat = $2; + sub(/^X86_FEATURE_/, "", feat); + feats[nfeat] = feat; +} +file == 1 && $1 ~ /^#define$/ && $2 == "NCAPINTS" { + ncapints = int($3); +} + +# Create a dictionary featstat[REQUIRED|DISABLED, FEATURE_NAME] = on | off +file == 2 && $1 ~ /^CONFIG_X86_(REQUIRED|DISABLED)_FEATURE_/ { + on = ($2 == "y"); + if (split($1, fs, "CONFIG_X86_|_FEATURE_") == 3) + featstat[fs[2], fs[3]] = on; +} + +END { + sets[1] = "REQUIRED"; + sets[2] = "DISABLED"; + + for (ns in sets) { + s = sets[ns]; + + printf "/*\n"; + printf " * %s features:\n", s; + printf " *\n"; + fstr = ""; + for (i = 0; i < ncapints; i++) { + mask = 0; + for (j = 0; j < 32; j++) { + feat = feats[i*32 + j]; + if (featstat[s, feat]) { + nfstr = fstr " " feat; + if (length(nfstr) > 72) { + printf " * %s\n", fstr; + nfstr = " " feat; + } + fstr = nfstr; + mask += (2 ^ j); + } + } + masks[i] = mask; + } + printf " * %s\n */\n", fstr; + + for (i = 0; i < ncapints; i++) + printf "#define %s_MASK%d\t0x%08xU\n", s, i, masks[i]; + + printf "\n#define %s_MASK_BIT_SET(x)\t\t\t\\\n", s; + printf "\t((\t\t\t\t\t"; + for (i = 0; i < ncapints; i++) { + if (masks[i]) + printf "\t\\\n\t\t((x) >> 5) == %2d ? %s_MASK%d :", i, s, i; + } + printf " 0\t\\\n"; + printf "\t) & (1U << ((x) & 31)))\n\n"; + } + + printf "#endif /* _ASM_X86_CPUFEATUREMASKS_H */\n"; +} diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index e937be979ec86..5778bc4984153 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -29,9 +29,13 @@ static struct relocs relocs16; static struct relocs relocs32; #if ELF_BITS == 64 -static struct relocs relocs32neg; static struct relocs relocs64; # define FMT PRIu64 + +#ifndef R_X86_64_REX_GOTPCRELX +# define R_X86_64_REX_GOTPCRELX 42 +#endif + #else # define FMT PRIu32 #endif @@ -86,8 +90,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 - "__per_cpu_load|" - "init_per_cpu__.*|" "__end_rodata_hpage_align|" #endif "_end)$" @@ -227,6 +229,7 @@ static const char *rel_type(unsigned type) REL_TYPE(R_X86_64_PC16), REL_TYPE(R_X86_64_8), REL_TYPE(R_X86_64_PC8), + REL_TYPE(R_X86_64_REX_GOTPCRELX), #else REL_TYPE(R_386_NONE), REL_TYPE(R_386_32), @@ -284,34 +287,6 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym) return name; } -static Elf_Sym *sym_lookup(const char *symname) -{ - int i; - - for (i = 0; i < shnum; i++) { - struct section *sec = &secs[i]; - long nsyms; - char *strtab; - Elf_Sym *symtab; - Elf_Sym *sym; - - if (sec->shdr.sh_type != SHT_SYMTAB) - continue; - - nsyms = sec->shdr.sh_size/sizeof(Elf_Sym); - symtab = sec->symtab; - strtab = sec->link->strtab; - - for (sym = symtab; --nsyms >= 0; sym++) { - if (!sym->st_name) - continue; - if (strcmp(symname, strtab + sym->st_name) == 0) - return sym; - } - } - return 0; -} - #if BYTE_ORDER == LITTLE_ENDIAN # define le16_to_cpu(val) (val) # define le32_to_cpu(val) (val) @@ -760,84 +735,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, } } -/* - * The .data..percpu section is a special case for x86_64 SMP kernels. - * It is used to initialize the actual per_cpu areas and to provide - * definitions for the per_cpu variables that correspond to their offsets - * within the percpu area. Since the values of all of the symbols need - * to be offsets from the start of the per_cpu area the virtual address - * (sh_addr) of .data..percpu is 0 in SMP kernels. - * - * This means that: - * - * Relocations that reference symbols in the per_cpu area do not - * need further relocation (since the value is an offset relative - * to the start of the per_cpu area that does not change). - * - * Relocations that apply to the per_cpu area need to have their - * offset adjusted by by the value of __per_cpu_load to make them - * point to the correct place in the loaded image (because the - * virtual address of .data..percpu is 0). - * - * For non SMP kernels .data..percpu is linked as part of the normal - * kernel data and does not require special treatment. - * - */ -static int per_cpu_shndx = -1; -static Elf_Addr per_cpu_load_addr; - -static void percpu_init(void) -{ - int i; - - for (i = 0; i < shnum; i++) { - ElfW(Sym) *sym; - - if (strcmp(sec_name(i), ".data..percpu")) - continue; - - if (secs[i].shdr.sh_addr != 0) /* non SMP kernel */ - return; - - sym = sym_lookup("__per_cpu_load"); - if (!sym) - die("can't find __per_cpu_load\n"); - - per_cpu_shndx = i; - per_cpu_load_addr = sym->st_value; - - return; - } -} - #if ELF_BITS == 64 -/* - * Check to see if a symbol lies in the .data..percpu section. - * - * The linker incorrectly associates some symbols with the - * .data..percpu section so we also need to check the symbol - * name to make sure that we classify the symbol correctly. - * - * The GNU linker incorrectly associates: - * __init_begin - * __per_cpu_load - * - * The "gold" linker incorrectly associates: - * init_per_cpu__fixed_percpu_data - * init_per_cpu__gdt_page - */ -static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) -{ - int shndx = sym_index(sym); - - return (shndx == per_cpu_shndx) && - strcmp(symname, "__init_begin") && - strcmp(symname, "__per_cpu_load") && - strncmp(symname, "init_per_cpu_", 13); -} - - static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { @@ -848,12 +747,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, if (sym->st_shndx == SHN_UNDEF) return 0; - /* - * Adjust the offset if this reloc applies to the percpu section. - */ - if (sec->shdr.sh_info == per_cpu_shndx) - offset += per_cpu_load_addr; - switch (r_type) { case R_X86_64_NONE: /* NONE can be ignored. */ @@ -861,33 +754,23 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, case R_X86_64_PC32: case R_X86_64_PLT32: + case R_X86_64_REX_GOTPCRELX: /* - * PC relative relocations don't need to be adjusted unless - * referencing a percpu symbol. + * PC relative relocations don't need to be adjusted. * * NB: R_X86_64_PLT32 can be treated as R_X86_64_PC32. */ - if (is_percpu_sym(sym, symname)) - add_reloc(&relocs32neg, offset); break; case R_X86_64_PC64: /* * Only used by jump labels */ - if (is_percpu_sym(sym, symname)) - die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname); break; case R_X86_64_32: case R_X86_64_32S: case R_X86_64_64: - /* - * References to the percpu area don't need to be adjusted. - */ - if (is_percpu_sym(sym, symname)) - break; - if (shn_abs) { /* * Whitelisted absolute symbols do not require @@ -1055,7 +938,8 @@ static int cmp_relocs(const void *va, const void *vb) static void sort_relocs(struct relocs *r) { - qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); + if (r->count) + qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); } static int write32(uint32_t v, FILE *f) @@ -1099,7 +983,6 @@ static void emit_relocs(int as_text, int use_real_mode) /* Order the relocations for more efficient processing */ sort_relocs(&relocs32); #if ELF_BITS == 64 - sort_relocs(&relocs32neg); sort_relocs(&relocs64); #else sort_relocs(&relocs16); @@ -1131,13 +1014,6 @@ static void emit_relocs(int as_text, int use_real_mode) /* Now print each relocation */ for (i = 0; i < relocs64.count; i++) write_reloc(relocs64.offset[i], stdout); - - /* Print a stop */ - write_reloc(0, stdout); - - /* Now print each inverse 32-bit relocation */ - for (i = 0; i < relocs32neg.count; i++) - write_reloc(relocs32neg.offset[i], stdout); #endif /* Print a stop */ @@ -1190,9 +1066,6 @@ void process(FILE *fp, int use_real_mode, int as_text, read_symtabs(fp); read_relocs(fp); - if (ELF_BITS == 64) - percpu_init(); - if (show_absolute_syms) { print_absolute_symbols(); return; diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 42e74a5a7d786..fc473ca12c44d 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -198,7 +198,6 @@ static void __init __snp_fixup_e820_tables(u64 pa) pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); - e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); if (!memblock_is_region_reserved(pa, PMD_SIZE)) memblock_reserve(pa, PMD_SIZE); } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 77e788e928cd4..98d8a50d2aed4 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,7 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) - depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d078de2c952b3..38971c6dcd4b7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6863d3da7decf..688ff59318ae2 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -70,7 +70,7 @@ static void cpu_bringup(void) xen_enable_syscall(); } cpu = smp_processor_id(); - smp_store_cpu_info(cpu); + identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index b518f36d1ca2e..109af12f76472 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -51,6 +51,7 @@ SYM_FUNC_END(xen_hypercall_pv) * non-zero. */ SYM_FUNC_START(xen_irq_disable_direct) + ENDBR movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) RET SYM_FUNC_END(xen_irq_disable_direct) @@ -90,6 +91,7 @@ SYM_FUNC_END(check_events) * then enter the hypervisor to get them handled. */ SYM_FUNC_START(xen_irq_enable_direct) + ENDBR FRAME_BEGIN /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) @@ -120,6 +122,7 @@ SYM_FUNC_END(xen_irq_enable_direct) * x86 use opposite senses (mask vs enable). */ SYM_FUNC_START(xen_save_fl_direct) + ENDBR testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) setz %ah addb %ah, %ah @@ -127,6 +130,7 @@ SYM_FUNC_START(xen_save_fl_direct) SYM_FUNC_END(xen_save_fl_direct) SYM_FUNC_START(xen_read_cr2) + ENDBR FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX @@ -135,6 +139,7 @@ SYM_FUNC_START(xen_read_cr2) SYM_FUNC_END(xen_read_cr2); SYM_FUNC_START(xen_read_cr2_direct) + ENDBR FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX FRAME_END diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 894edf8d6d62f..5dad6c51cdc34 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -31,16 +31,14 @@ SYM_CODE_START(startup_xen) leaq __top_init_kernel_stack(%rip), %rsp - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax - cdq + xorl %eax, %eax + xorl %edx, %edx wrmsr mov %rsi, %rdi @@ -133,11 +131,13 @@ SYM_FUNC_START(xen_hypercall_hvm) SYM_FUNC_END(xen_hypercall_hvm) SYM_FUNC_START(xen_hypercall_amd) + ANNOTATE_NOENDBR vmmcall RET SYM_FUNC_END(xen_hypercall_amd) SYM_FUNC_START(xen_hypercall_intel) + ANNOTATE_NOENDBR vmcall RET SYM_FUNC_END(xen_hypercall_intel) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1675422016038..abd80dc135623 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7315,9 +7315,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) INIT_LIST_HEAD(&bfqd->dispatch); - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + hrtimer_setup(&bfqd->idle_slice_timer, bfq_idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); bfqd->queue_weights_tree = RB_ROOT_CACHED; #ifdef CONFIG_BFQ_GROUP_IOSCHED diff --git a/block/bio.c b/block/bio.c index f0c416e5931d9..6ac5983ba51e6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -77,7 +77,7 @@ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; - char name[8]; + char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 65a1d4427ccf4..ed11438eb4be3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3004,8 +3004,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); - hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->waitq_timer.function = iocg_waitq_timer_fn; + hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->level = blkg->blkcg->css.cgroup->level; diff --git a/block/blk-merge.c b/block/blk-merge.c index 15cd231d560cb..1d1589c352976 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -270,7 +270,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, unsigned max_segs, unsigned max_bytes) { - unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; + unsigned max_len = max_bytes - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; @@ -329,7 +329,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + bv.bv_offset + bv.bv_len <= lim->min_segment_size) { nsegs++; bytes += bv.bv_len; } else { @@ -556,11 +556,14 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, { struct req_iterator iter = { .bio = rq->bio, - .iter = rq->bio->bi_iter, }; struct phys_vec vec; int nsegs = 0; + /* the internal flush request may not have bio attached */ + if (iter.bio) + iter.iter = iter.bio->bi_iter; + while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, diff --git a/block/blk-settings.c b/block/blk-settings.c index c44dadc35e1ec..b9c6f0ec1c499 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -246,6 +246,7 @@ int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + unsigned long seg_size; int err; /* @@ -303,7 +304,7 @@ int blk_validate_limits(struct queue_limits *lim) max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { - if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { @@ -341,7 +342,7 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* @@ -362,10 +363,17 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } + /* setup min segment size for building new segment in fast path */ + if (lim->seg_boundary_mask > lim->max_segment_size - 1) + seg_size = lim->max_segment_size; + else + seg_size = lim->seg_boundary_mask + 1; + lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 761ea662ddc34..0c77244a35c92 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -410,13 +410,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, } } hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return true; } -static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, - sector_t sector) +static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, + sector_t sector) { unsigned int zno = disk_zone_no(disk, sector); unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); @@ -437,6 +438,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, return NULL; } +static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + if (!atomic_read(&disk->nr_zone_wplugs)) + return NULL; + + return disk_get_hashed_zone_wplug(disk, sector); +} + static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) { struct blk_zone_wplug *zwplug = @@ -503,6 +513,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk, zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); disk_put_zone_wplug(zwplug); } @@ -593,6 +604,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; + if (bio_list_empty(&zwplug->bio_list)) + return; + + pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", + zwplug->disk->disk_name, zwplug->zone_no); while ((bio = bio_list_pop(&zwplug->bio_list))) blk_zone_wplug_bio_io_error(zwplug, bio); } @@ -1040,6 +1056,47 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } +static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * We have native support for zone append operations, so we are not + * going to handle @bio through plugging. However, we may already have a + * zone write plug for the target zone if that zone was previously + * partially written using regular writes. In such case, we risk leaving + * the plug in the disk hash table if the zone is fully written using + * zone append operations. Avoid this by removing the zone write plug. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (likely(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * We are about to remove the zone write plug. But if the user + * (mistakenly) has issued regular writes together with native zone + * append, we must aborts the writes as otherwise the plugged BIOs would + * not be executed by the plug BIO work as disk_get_zone_wplug() will + * return NULL after the plug is removed. Aborting the plugged write + * BIOs is consistent with the fact that these writes will most likely + * fail anyway as there is no ordering guarantees between zone append + * operations and regular write operations. + */ + if (!bio_list_empty(&zwplug->bio_list)) { + pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", + disk->disk_name, zwplug->zone_no); + disk_zone_wplug_abort(zwplug); + } + disk_remove_zone_wplug(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1096,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: - if (!bdev_emulates_zone_append(bdev)) + if (!bdev_emulates_zone_append(bdev)) { + blk_zone_wplug_handle_native_zone_append(bio); return false; + } fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: @@ -1284,6 +1343,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, { unsigned int i; + atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); @@ -1338,6 +1398,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) } } + WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; @@ -1550,11 +1611,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, } /* - * We need to track the write pointer of all zones that are not - * empty nor full. So make sure we have a zone write plug for - * such zone if the device has a zone write plug hash table. + * If the device needs zone append emulation, we need to track the + * write pointer of all zones that are not empty nor full. So make sure + * we have a zone write plug for such zone if the device has a zone + * write plug hash table. */ - if (!disk->zone_wplugs_hash) + if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; disk_zone_wplug_sync_wp_offset(disk, zone); diff --git a/block/blk.h b/block/blk.h index 90fa5f28ccabf..9cf9a0099416d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,6 +14,7 @@ struct elevator_type; #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) +#define BLK_MIN_SEGMENT_SIZE 4096 /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -358,8 +359,12 @@ struct bio *bio_split_zone_append(struct bio *bio, static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { - return lim->chunk_sectors || bio->bi_vcnt != 1 || - bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; + if (lim->chunk_sectors) + return true; + if (bio->bi_vcnt != 1) + return true; + return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > + lim->min_segment_size; } /** diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 5e9be13a56a82..7acba66eed481 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) out[size] = 0; while (i < size) { - u8 c = le16_to_cpu(in[i]) & 0xff; + u8 c = le16_to_cpu(in[i]) & 0x7f; if (c && !isprint(c)) c = '!'; diff --git a/ci/diffs/.keep b/ci/diffs/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch new file mode 100644 index 0000000000000..3b6139225e7bf --- /dev/null +++ b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch @@ -0,0 +1,33 @@ +From 5440a12ac8fb2a8e051c597fcf5d85b427fe612a Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 13 Oct 2023 12:44:34 -0700 +Subject: [PATCH] Revert "bpf: Avoid unnecessary audit log for CPU security + mitigations" + +This reverts commit 236334aeec0f93217cf9235f2004e61a0a1a5985. +--- + include/linux/bpf.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index f0891ba24cb1..61bde4520f5c 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void) + + static inline bool bpf_bypass_spec_v1(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + static inline bool bpf_bypass_spec_v4(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + int bpf_map_new_fd(struct bpf_map *map, int flags); +-- +2.34.1 + diff --git a/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch new file mode 100644 index 0000000000000..63bdd28adedd2 --- /dev/null +++ b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch @@ -0,0 +1,69 @@ +From c71766e8ff7a7f950522d25896fba758585500df Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 22 Apr 2024 21:14:40 -0700 +Subject: [PATCH] arch/Kconfig: Move SPECULATION_MITIGATIONS to arch/Kconfig + +SPECULATION_MITIGATIONS is currently defined only for x86. As a result, +IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) is always false for other +archs. f337a6a21e2f effectively set "mitigations=off" by default on +non-x86 archs, which is not desired behavior. Jakub observed this +change when running bpf selftests on s390 and arm64. + +Fix this by moving SPECULATION_MITIGATIONS to arch/Kconfig so that it is +available in all archs and thus can be used safely in kernel/cpu.c + +Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n") +Cc: stable@vger.kernel.org +Cc: Sean Christopherson +Cc: Ingo Molnar +Cc: Daniel Sneddon +Cc: Jakub Kicinski +Signed-off-by: Song Liu +--- + arch/Kconfig | 10 ++++++++++ + arch/x86/Kconfig | 10 ---------- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 9f066785bb71..8f4af75005f8 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1609,4 +1609,14 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT + # strict alignment always, even with -falign-functions. + def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG + ++menuconfig SPECULATION_MITIGATIONS ++ bool "Mitigations for speculative execution vulnerabilities" ++ default y ++ help ++ Say Y here to enable options which enable mitigations for ++ speculative execution hardware vulnerabilities. ++ ++ If you say N, all mitigations will be disabled. You really ++ should know what you are doing to say so. ++ + endmenu +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 39886bab943a..50c890fce5e0 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2486,16 +2486,6 @@ config PREFIX_SYMBOLS + def_bool y + depends on CALL_PADDING && !CFI_CLANG + +-menuconfig SPECULATION_MITIGATIONS +- bool "Mitigations for speculative execution vulnerabilities" +- default y +- help +- Say Y here to enable options which enable mitigations for +- speculative execution hardware vulnerabilities. +- +- If you say N, all mitigations will be disabled. You really +- should know what you are doing to say so. +- + if SPECULATION_MITIGATIONS + + config MITIGATION_PAGE_TABLE_ISOLATION +-- +2.43.0 + diff --git a/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch new file mode 100644 index 0000000000000..a13d767197413 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch @@ -0,0 +1,94 @@ +From fb9a697860acd8f54f2ba6647923794378eb33da Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Sun, 26 Nov 2023 21:03:42 -0800 +Subject: [PATCH] bpf: Fix a few selftest failures due to llvm18 change + +With latest upstream llvm18, the following test cases failed: + + $ ./test_progs -j + #13/2 bpf_cookie/multi_kprobe_link_api:FAIL + #13/3 bpf_cookie/multi_kprobe_attach_api:FAIL + #13 bpf_cookie:FAIL + #77 fentry_fexit:FAIL + #78/1 fentry_test/fentry:FAIL + #78 fentry_test:FAIL + #82/1 fexit_test/fexit:FAIL + #82 fexit_test:FAIL + #112/1 kprobe_multi_test/skel_api:FAIL + #112/2 kprobe_multi_test/link_api_addrs:FAIL + [...] + #112 kprobe_multi_test:FAIL + #356/17 test_global_funcs/global_func17:FAIL + #356 test_global_funcs:FAIL + +Further analysis shows llvm upstream patch [1] is responsible for the above +failures. For example, for function bpf_fentry_test7() in net/bpf/test_run.c, +without [1], the asm code is: + + 0000000000000400 : + 400: f3 0f 1e fa endbr64 + 404: e8 00 00 00 00 callq 0x409 + 409: 48 89 f8 movq %rdi, %rax + 40c: c3 retq + 40d: 0f 1f 00 nopl (%rax) + +... and with [1], the asm code is: + + 0000000000005d20 : + 5d20: e8 00 00 00 00 callq 0x5d25 + 5d25: c3 retq + +... and is called instead of +and this caused test failures for #13/#77 etc. except #356. + +For test case #356/17, with [1] (progs/test_global_func17.c)), the main prog +looks like: + + 0000000000000000 : + 0: b4 00 00 00 2a 00 00 00 w0 = 0x2a + 1: 95 00 00 00 00 00 00 00 exit + +... which passed verification while the test itself expects a verification +failure. + +Let us add 'barrier_var' style asm code in both places to prevent function +specialization which caused selftests failure. + + [1] https://github.com/llvm/llvm-project/pull/72903 + +Signed-off-by: Yonghong Song +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20231127050342.1945270-1-yonghong.song@linux.dev +--- + net/bpf/test_run.c | 2 +- + tools/testing/selftests/bpf/progs/test_global_func17.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c +index c9fdcc5cdce1..711cf5d59816 100644 +--- a/net/bpf/test_run.c ++++ b/net/bpf/test_run.c +@@ -542,7 +542,7 @@ struct bpf_fentry_test_t { + + int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) + { +- asm volatile (""); ++ asm volatile ("": "+r"(arg)); + return (long)arg; + } + +diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c +index a32e11c7d933..5de44b09e8ec 100644 +--- a/tools/testing/selftests/bpf/progs/test_global_func17.c ++++ b/tools/testing/selftests/bpf/progs/test_global_func17.c +@@ -5,6 +5,7 @@ + + __noinline int foo(int *p) + { ++ barrier_var(p); + return p ? (*p = 42) : 0; + } + +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch new file mode 100644 index 0000000000000..5832a42664706 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch @@ -0,0 +1,67 @@ +From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Thu, 30 Nov 2023 18:46:40 -0800 +Subject: [PATCH 1/1] bpf: Fix a verifier bug due to incorrect branch offset + comparison with cpu=v4 + +Bpf cpu=v4 support is introduced in [1] and Commit 4cd58e9af8b9 +("bpf: Support new 32bit offset jmp instruction") added support for new +32bit offset jmp instruction. Unfortunately, in function +bpf_adj_delta_to_off(), for new branch insn with 32bit offset, the offset +(plus/minor a small delta) compares to 16-bit offset bound +[S16_MIN, S16_MAX], which caused the following verification failure: + $ ./test_progs-cpuv4 -t verif_scale_pyperf180 + ... + insn 10 cannot be patched due to 16-bit range + ... + libbpf: failed to load object 'pyperf180.bpf.o' + scale_test:FAIL:expect_success unexpected error: -12 (errno 12) + #405 verif_scale_pyperf180:FAIL + +Note that due to recent llvm18 development, the patch [2] (already applied +in bpf-next) needs to be applied to bpf tree for testing purpose. + +The fix is rather simple. For 32bit offset branch insn, the adjusted +offset compares to [S32_MIN, S32_MAX] and then verification succeeded. + + [1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev + [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev + +Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev +--- + kernel/bpf/core.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index cd3afe57ece3..fe254ae035fe 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) + { +- const s32 off_min = S16_MIN, off_max = S16_MAX; ++ s64 off_min, off_max, off; + s32 delta = end_new - end_old; +- s32 off; + +- if (insn->code == (BPF_JMP32 | BPF_JA)) ++ if (insn->code == (BPF_JMP32 | BPF_JA)) { + off = insn->imm; +- else ++ off_min = S32_MIN; ++ off_max = S32_MAX; ++ } else { + off = insn->off; ++ off_min = S16_MIN; ++ off_max = S16_MAX; ++ } + + if (curr < pos && curr + off + 1 >= end_old) + off += delta; +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch new file mode 100644 index 0000000000000..ea6b2386d0345 --- /dev/null +++ b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch @@ -0,0 +1,40 @@ +From patchwork Fri Aug 2 18:54:34 2024 +From: Yonghong Song +Subject: [PATCH bpf-next] selftests/bpf: Fix a btf_dump selftest failure + +Jakub reported bpf selftest "btf_dump" failure after forwarding to +v6.11-rc1 with netdev. + Error: #33 btf_dump + Error: #33/15 btf_dump/btf_dump: var_data + btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 + +The reason for the failure is due to + commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") +where percpu static variable "cpu_profile_flip" is removed. + +Let us replace "cpu_profile_flip" with a variable in bpf subsystem +so whenever that variable gets deleted or renamed, we can detect the +failure immediately. In this case, I picked a static percpu variable +"bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. + +Reported-by: Jakub Kicinski +Signed-off-by: Yonghong Song +--- + tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +index 09a8e6f9b379..b293b8501fd6 100644 +--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c ++++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +@@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + #endif +- TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, +- "static int cpu_profile_flip = (int)2", 2); ++ TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, ++ "static int bpf_cgrp_storage_busy = (int)2", 2); + } + + static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, diff --git a/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch new file mode 100644 index 0000000000000..bd12bd9b3fba5 --- /dev/null +++ b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch @@ -0,0 +1,99 @@ +From c8268f8e9fa33c32e1f2f86fc7b703408a396c70 Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 27 Oct 2023 11:24:24 -0700 +Subject: [PATCH] net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() + +With latest sync from net-next tree, bpf-next has a bpf selftest failure: + [root@arch-fb-vm1 bpf]# ./test_progs -t setget_sockopt + ... + [ 76.194349] ============================================ + [ 76.194682] WARNING: possible recursive locking detected + [ 76.195039] 6.6.0-rc7-g37884503df08-dirty #67 Tainted: G W OE + [ 76.195518] -------------------------------------------- + [ 76.195852] new_name/154 is trying to acquire lock: + [ 76.196159] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: ip_sock_set_tos+0x19/0x30 + [ 76.196669] + [ 76.196669] but task is already holding lock: + [ 76.197028] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.197517] + [ 76.197517] other info that might help us debug this: + [ 76.197919] Possible unsafe locking scenario: + [ 76.197919] + [ 76.198287] CPU0 + [ 76.198444] ---- + [ 76.198600] lock(sk_lock-AF_INET); + [ 76.198831] lock(sk_lock-AF_INET); + [ 76.199062] + [ 76.199062] *** DEADLOCK *** + [ 76.199062] + [ 76.199420] May be due to missing lock nesting notation + [ 76.199420] + [ 76.199879] 2 locks held by new_name/154: + [ 76.200131] #0: ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.200644] #1: ffffffff90f96a40 (rcu_read_lock){....}-{1:2}, at: __cgroup_bpf_run_filter_sock_ops+0x55/0x290 + [ 76.201268] + [ 76.201268] stack backtrace: + [ 76.201538] CPU: 4 PID: 154 Comm: new_name Tainted: G W OE 6.6.0-rc7-g37884503df08-dirty #67 + [ 76.202134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 + [ 76.202699] Call Trace: + [ 76.202858] + [ 76.203002] dump_stack_lvl+0x4b/0x80 + [ 76.203239] __lock_acquire+0x740/0x1ec0 + [ 76.203503] lock_acquire+0xc1/0x2a0 + [ 76.203766] ? ip_sock_set_tos+0x19/0x30 + [ 76.204050] ? sk_stream_write_space+0x12a/0x230 + [ 76.204389] ? lock_release+0xbe/0x260 + [ 76.204661] lock_sock_nested+0x32/0x80 + [ 76.204942] ? ip_sock_set_tos+0x19/0x30 + [ 76.205208] ip_sock_set_tos+0x19/0x30 + [ 76.205452] do_ip_setsockopt+0x4b3/0x1580 + [ 76.205719] __bpf_setsockopt+0x62/0xa0 + [ 76.205963] bpf_sock_ops_setsockopt+0x11/0x20 + [ 76.206247] bpf_prog_630217292049c96e_bpf_test_sockopt_int+0xbc/0x123 + [ 76.206660] bpf_prog_493685a3bae00bbd_bpf_test_ip_sockopt+0x49/0x4b + [ 76.207055] bpf_prog_b0bcd27f269aeea0_skops_sockopt+0x44c/0xec7 + [ 76.207437] __cgroup_bpf_run_filter_sock_ops+0xda/0x290 + [ 76.207829] __inet_listen_sk+0x108/0x1b0 + [ 76.208122] inet_listen+0x48/0x70 + [ 76.208373] __sys_listen+0x74/0xb0 + [ 76.208630] __x64_sys_listen+0x16/0x20 + [ 76.208911] do_syscall_64+0x3f/0x90 + [ 76.209174] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 + ... + +Both ip_sock_set_tos() and inet_listen() calls lock_sock(sk) which +caused a dead lock. + +To fix the issue, use sockopt_lock_sock() in ip_sock_set_tos() +instead. sockopt_lock_sock() will avoid lock_sock() if it is in bpf +context. + +Fixes: 878d951c6712 ("inet: lock the socket in ip_sock_set_tos()") +Suggested-by: Martin KaFai Lau +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/bpf/20231027182424.1444845-1-yonghong.song@linux.dev +--- + net/ipv4/ip_sockglue.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index 9c68b6b74d9f..2efc53526a38 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -602,9 +602,9 @@ void __ip_sock_set_tos(struct sock *sk, int val) + + void ip_sock_set_tos(struct sock *sk, int val) + { +- lock_sock(sk); ++ sockopt_lock_sock(sk); + __ip_sock_set_tos(sk, val); +- release_sock(sk); ++ sockopt_release_sock(sk); + } + EXPORT_SYMBOL(ip_sock_set_tos); + +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch new file mode 100644 index 0000000000000..da5bcdc455967 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch @@ -0,0 +1,51 @@ +From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 +From: Stanislav Fomichev +Date: Thu, 25 Jul 2024 14:40:29 -0700 +Subject: [PATCH 1/1] selftests/bpf: Filter out _GNU_SOURCE when compiling + test_cpp + +Jakub reports build failures when merging linux/master with net tree: + +CXX test_cpp +In file included from :454: +:2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] + 2 | #define _GNU_SOURCE + | ^ +:445:9: note: previous definition is here + 445 | #define _GNU_SOURCE 1 + +The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to +CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. +Apparently clang++ also unconditionally adds it for the C++ targets [0] +which causes a conflict. Add small change in the selftests makefile +to filter it out for test_cpp. + +Not sure which tree it should go via, targeting bpf for now, but net +might be better? + +0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off + +Signed-off-by: Stanislav Fomichev +Signed-off-by: Andrii Nakryiko +Acked-by: Jiri Olsa +Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me +--- + tools/testing/selftests/bpf/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index dd49c1d23a60..81d4757ecd4c 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + # Make sure we are able to include and link libbpf against c++. + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch new file mode 100644 index 0000000000000..4ebfe20b24707 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch @@ -0,0 +1,50 @@ +From f3d2080e8cf23125f79e345061149ae40f66816f Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 3 Jun 2024 23:43:17 -0700 +Subject: [PATCH bpf-next] selftests/bpf: Fix bpf_cookie and find_vma in nested + VM + +bpf_cookie and find_vma are flaky in nested VMs, which is used by some CI +systems. It turns out these failures are caused by unreliable perf event +in nested VM. Fix these by: + + 1. Use PERF_COUNT_SW_CPU_CLOCK in find_vma; + 2. Increase sample_freq in bpf_cookie. + +Signed-off-by: Song Liu +--- + tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +- + tools/testing/selftests/bpf/prog_tests/find_vma.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +index 4407ea428e77..070c52c312e5 100644 +--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c ++++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +@@ -451,7 +451,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; +- attr.sample_freq = 1000; ++ attr.sample_freq = 10000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; +diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c +index 5165b38f0e59..f7619e0ade10 100644 +--- a/tools/testing/selftests/bpf/prog_tests/find_vma.c ++++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c +@@ -29,8 +29,8 @@ static int open_pe(void) + + /* create perf event */ + attr.size = sizeof(attr); +- attr.type = PERF_TYPE_HARDWARE; +- attr.config = PERF_COUNT_HW_CPU_CYCLES; ++ attr.type = PERF_TYPE_SOFTWARE; ++ attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 1000; + pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch new file mode 100644 index 0000000000000..d55d2e7af8651 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch @@ -0,0 +1,78 @@ +From 100888fb6d8a185866b1520031ee7e3182b173de Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 10 Nov 2023 11:36:44 -0800 +Subject: [PATCH] selftests/bpf: Fix pyperf180 compilation failure with clang18 + +With latest clang18 (main branch of llvm-project repo), when building bpf selftests, + [~/work/bpf-next (master)]$ make -C tools/testing/selftests/bpf LLVM=1 -j + +The following compilation error happens: + fatal error: error in backend: Branch target out of insn range + ... + Stack dump: + 0. Program arguments: clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -I/home/yhs/work/bpf-next/tools/include/uapi + -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -idirafter + /home/yhs/work/llvm-project/llvm/build.18/install/lib/clang/18/include -idirafter /usr/local/include + -idirafter /usr/include -Wno-compare-distinct-pointer-types -DENABLE_ATOMICS_TESTS -O2 --target=bpf + -c progs/pyperf180.c -mcpu=v3 -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/pyperf180.bpf.o + 1. parser at end of file + 2. Code generation + ... + +The compilation failure only happens to cpu=v2 and cpu=v3. cpu=v4 is okay +since cpu=v4 supports 32-bit branch target offset. + +The above failure is due to upstream llvm patch [1] where some inlining behavior +are changed in clang18. + +To workaround the issue, previously all 180 loop iterations are fully unrolled. +The bpf macro __BPF_CPU_VERSION__ (implemented in clang18 recently) is used to avoid +unrolling changes if cpu=v4. If __BPF_CPU_VERSION__ is not available and the +compiler is clang18, the unrollng amount is unconditionally reduced. + + [1] https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e + +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Tested-by: Alan Maguire +Link: https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev +--- + tools/testing/selftests/bpf/progs/pyperf180.c | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c +index c39f559d3100..42c4a8b62e36 100644 +--- a/tools/testing/selftests/bpf/progs/pyperf180.c ++++ b/tools/testing/selftests/bpf/progs/pyperf180.c +@@ -1,4 +1,26 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (c) 2019 Facebook + #define STACK_MAX_LEN 180 ++ ++/* llvm upstream commit at clang18 ++ * https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e ++ * changed inlining behavior and caused compilation failure as some branch ++ * target distance exceeded 16bit representation which is the maximum for ++ * cpu v1/v2/v3. Macro __BPF_CPU_VERSION__ is later implemented in clang18 ++ * to specify which cpu version is used for compilation. So a smaller ++ * unroll_count can be set if __BPF_CPU_VERSION__ is less than 4, which ++ * reduced some branch target distances and resolved the compilation failure. ++ * ++ * To capture the case where a developer/ci uses clang18 but the corresponding ++ * repo checkpoint does not have __BPF_CPU_VERSION__, a smaller unroll_count ++ * will be set as well to prevent potential compilation failures. ++ */ ++#ifdef __BPF_CPU_VERSION__ ++#if __BPF_CPU_VERSION__ < 4 ++#define UNROLL_COUNT 90 ++#endif ++#elif __clang_major__ == 18 ++#define UNROLL_COUNT 90 ++#endif ++ + #include "pyperf.h" +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch new file mode 100644 index 0000000000000..6497a6cc38c90 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch @@ -0,0 +1,41 @@ +From 42839864a62ee244ec280b09149b1cb439f681db Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Fri, 27 Oct 2023 18:25:39 -0700 +Subject: [PATCH bpf-next] selftests/bpf: disable detection of llvm when + building bpftool + +The VMs in which we run the selftests do not have llvm installed. +We build selftests/bpftool in a host that have llvm. +bpftool currently will use llvm first and fallback to libbfd but there +is no way to disable detection from the command line. + +Removing it from the feature detection should force us to use libbfd. + +Signed-off-by: Manu Bretelle +--- + tools/bpf/bpftool/Makefile | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile +index e9154ace80ff..01314458e25e 100644 +--- a/tools/bpf/bpftool/Makefile ++++ b/tools/bpf/bpftool/Makefile +@@ -95,7 +95,6 @@ RM ?= rm -f + FEATURE_USER = .bpftool + + FEATURE_TESTS := clang-bpf-co-re +-FEATURE_TESTS += llvm + FEATURE_TESTS += libcap + FEATURE_TESTS += libbfd + FEATURE_TESTS += libbfd-liberty +@@ -104,7 +103,6 @@ FEATURE_TESTS += disassembler-four-args + FEATURE_TESTS += disassembler-init-styled + + FEATURE_DISPLAY := clang-bpf-co-re +-FEATURE_DISPLAY += llvm + FEATURE_DISPLAY += libcap + FEATURE_DISPLAY += libbfd + FEATURE_DISPLAY += libbfd-liberty +-- +2.39.3 + diff --git a/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch new file mode 100644 index 0000000000000..3fa007c51db68 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch @@ -0,0 +1,32 @@ +From 0daad0a615e687e1247230f3d0c31ae60ba32314 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 28 May 2024 15:29:38 -0700 +Subject: [PATCH bpf-next] selftests/bpf: fix inet_csk_accept prototype in + test_sk_storage_tracing.c + +Recent kernel change ([0]) changed inet_csk_accept() prototype. Adapt +progs/test_sk_storage_tracing.c to take that into account. + + [0] 92ef0fd55ac8 ("net: change proto and proto_ops accept type") + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +index 02e718f06e0f..40531e56776e 100644 +--- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c ++++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +@@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) + } + + SEC("fexit/inet_csk_accept") +-int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, ++int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, + struct sock *accepted_sk) + { + set_task_info(accepted_sk); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch new file mode 100644 index 0000000000000..ec1e29a8ab974 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch @@ -0,0 +1,31 @@ +From d31a7125891994681503770cff46a119692fb2b9 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Mon, 11 Dec 2023 17:09:38 -0800 +Subject: [PATCH 1/1] selftests/bpf: work around latest Clang smartness + +Work around the issue while we deal with it in the Clang itself. +See [0]. + + [0] https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758 + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/iters.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c +index 3aca3dc145b5..929ba6fa2105 100644 +--- a/tools/testing/selftests/bpf/progs/iters.c ++++ b/tools/testing/selftests/bpf/progs/iters.c +@@ -1420,7 +1420,7 @@ SEC("raw_tp") + __success + int iter_arr_with_actual_elem_count(const void *ctx) + { +- int i, n = loop_data.n, sum = 0; ++ unsigned i, n = loop_data.n, sum = 0; + + if (n > ARRAY_SIZE(loop_data.data)) + return 0; +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch new file mode 100644 index 0000000000000..e631fac0cc698 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch @@ -0,0 +1,89 @@ +From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 +From: Anders Roxell +Date: Thu, 9 Nov 2023 18:43:28 +0100 +Subject: [PATCH] selftests: bpf: xskxceiver: ksft_print_msg: fix format type + error + +Crossbuilding selftests/bpf for architecture arm64, format specifies +type error show up like. + +xskxceiver.c:912:34: error: format specifies type 'int' but the argument +has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", + ~~ + %llu + __func__, pkt->pkt_nb, meta->count); + ^~~~~~~~~~~ +xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but + the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ~~~~ ^~~~ + +Fixing the issues by casting to (unsigned long long) and changing the +specifiers to be %llu from %d and %u, since with u64s it might be %llx +or %lx, depending on architecture. + +Signed-off-by: Anders Roxell +Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- + 1 file changed, 12 insertions(+), 7 deletions(-) + +diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c +index 591ca9637b23..b604c570309a 100644 +--- a/tools/testing/selftests/bpf/xskxceiver.c ++++ b/tools/testing/selftests/bpf/xskxceiver.c +@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) + struct xdp_info *meta = data - sizeof(struct xdp_info); + + if (meta->count != pkt->pkt_nb) { +- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", +- __func__, pkt->pkt_nb, meta->count); ++ ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", ++ __func__, pkt->pkt_nb, ++ (unsigned long long)meta->count); + return false; + } + +@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp + + if (addr >= umem->num_frames * umem->frame_size || + addr + len > umem->num_frames * umem->frame_size) { +- ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag invalid addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { +- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); ++ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", ++ (unsigned long long)addr, len); + return false; + } + +@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) + u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); + + ksft_print_msg("[%s] Too many packets completed\n", __func__); +- ksft_print_msg("Last completion address: %llx\n", addr); ++ ksft_print_msg("Last completion address: %llx\n", ++ (unsigned long long)addr); + return TEST_FAILURE; + } + +@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) + } + + if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { +- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", +- __func__, stats.tx_invalid_descs, ++ ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", ++ __func__, ++ (unsigned long long)stats.tx_invalid_descs, + ifobject->xsk->pkt_stream->nb_pkts); + return TEST_FAILURE; + } +-- +2.34.1 + diff --git a/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 0000000000000..19d269de7e8ca --- /dev/null +++ b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,142 @@ +From 3772e6cdb51f21a11df2acf6aa431cc8b9137bfb Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:09 +0100 +Subject: [PATCH 1/2] tools/resolve_btfids: Refactor set sorting with types + from btf_ids.h + +Instead of using magic offsets to access BTF ID set data, leverage types +from btf_ids.h (btf_id_set and btf_id_set8) which define the actual +layout of the data. Thanks to this change, set sorting should also +continue working if the layout changes. + +This requires to sync the definition of 'struct btf_id_set8' from +include/linux/btf_ids.h to tools/include/linux/btf_ids.h. We don't sync +the rest of the file at the moment, b/c that would require to also sync +multiple dependent headers and we don't need any other defs from +btf_ids.h. + +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Acked-by: Daniel Xu +Link: https://lore.kernel.org/bpf/ff7f062ddf6a00815fda3087957c4ce667f50532.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 ++++++++++++++++++++------------- + tools/include/linux/btf_ids.h | 9 +++++++++ + 2 files changed, 30 insertions(+), 14 deletions(-) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 27a23196d58e..32634f00abba 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -70,6 +70,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -78,7 +79,7 @@ + #include + + #define BTF_IDS_SECTION ".BTF_ids" +-#define BTF_ID "__BTF_ID__" ++#define BTF_ID_PREFIX "__BTF_ID__" + + #define BTF_STRUCT "struct" + #define BTF_UNION "union" +@@ -161,7 +162,7 @@ static int eprintf(int level, int var, const char *fmt, ...) + + static bool is_btf_id(const char *name) + { +- return name && !strncmp(name, BTF_ID, sizeof(BTF_ID) - 1); ++ return name && !strncmp(name, BTF_ID_PREFIX, sizeof(BTF_ID_PREFIX) - 1); + } + + static struct btf_id *btf_id__find(struct rb_root *root, const char *name) +@@ -441,7 +442,7 @@ static int symbols_collect(struct object *obj) + * __BTF_ID__TYPE__vfs_truncate__0 + * prefix = ^ + */ +- prefix = name + sizeof(BTF_ID) - 1; ++ prefix = name + sizeof(BTF_ID_PREFIX) - 1; + + /* struct */ + if (!strncmp(prefix, BTF_STRUCT, sizeof(BTF_STRUCT) - 1)) { +@@ -649,19 +650,18 @@ static int cmp_id(const void *pa, const void *pb) + static int sets_patch(struct object *obj) + { + Elf_Data *data = obj->efile.idlist; +- int *ptr = data->d_buf; + struct rb_node *next; + + next = rb_first(&obj->sets); + while (next) { +- unsigned long addr, idx; ++ struct btf_id_set8 *set8; ++ struct btf_id_set *set; ++ unsigned long addr, off; + struct btf_id *id; +- int *base; +- int cnt; + + id = rb_entry(next, struct btf_id, rb_node); + addr = id->addr[0]; +- idx = addr - obj->efile.idlist_addr; ++ off = addr - obj->efile.idlist_addr; + + /* sets are unique */ + if (id->addr_cnt != 1) { +@@ -670,14 +670,21 @@ static int sets_patch(struct object *obj) + return -1; + } + +- idx = idx / sizeof(int); +- base = &ptr[idx] + (id->is_set8 ? 2 : 1); +- cnt = ptr[idx]; ++ if (id->is_set) { ++ set = data->d_buf + off; ++ qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); ++ } else { ++ set8 = data->d_buf + off; ++ /* ++ * Make sure id is at the beginning of the pairs ++ * struct, otherwise the below qsort would not work. ++ */ ++ BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); ++ qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +- (idx + 1) * sizeof(int), cnt, id->name); +- +- qsort(base, cnt, id->is_set8 ? sizeof(uint64_t) : sizeof(int), cmp_id); ++ off, id->is_set ? set->cnt : set8->cnt, id->name); + + next = rb_next(next); + } +diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h +index 2f882d5cb30f..72535f00572f 100644 +--- a/tools/include/linux/btf_ids.h ++++ b/tools/include/linux/btf_ids.h +@@ -8,6 +8,15 @@ struct btf_id_set { + u32 ids[]; + }; + ++struct btf_id_set8 { ++ u32 cnt; ++ u32 flags; ++ struct { ++ u32 id; ++ u32 flags; ++ } pairs[]; ++}; ++ + #ifdef CONFIG_DEBUG_INFO_BTF + + #include /* for __PASTE */ +-- +2.39.3 + + + diff --git a/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch new file mode 100644 index 0000000000000..24ebc231056cb --- /dev/null +++ b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch @@ -0,0 +1,65 @@ +From 08969a676d234a178ff9f8c67936a2ad98a741eb Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 27 Oct 2023 16:22:24 -0700 +Subject: [PATCH] tracing/kprobes: Fix symbol counting logic by looking at + modules as well + +Recent changes to count number of matching symbols when creating +a kprobe event failed to take into account kernel modules. As such, it +breaks kprobes on kernel module symbols, by assuming there is no match. + +Fix this my calling module_kallsyms_on_each_symbol() in addition to +kallsyms_on_each_match_symbol() to perform a proper counting. + +Cc: Francis Laniel +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Steven Rostedt +Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols") +Signed-off-by: Andrii Nakryiko +--- + kernel/trace/trace_kprobe.c | 24 ++++++++++++++++++++---- + 1 file changed, 20 insertions(+), 4 deletions(-) + +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index effcaede4759..1efb27f35963 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused) + return 0; + } + ++struct sym_count_ctx { ++ unsigned int count; ++ const char *name; ++}; ++ ++static int count_mod_symbols(void *data, const char *name, unsigned long unused) ++{ ++ struct sym_count_ctx *ctx = data; ++ ++ if (strcmp(name, ctx->name) == 0) ++ ctx->count++; ++ ++ return 0; ++} ++ + static unsigned int number_of_same_symbols(char *func_name) + { +- unsigned int count; ++ struct sym_count_ctx ctx = { .count = 0, .name = func_name }; ++ ++ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + +- count = 0; +- kallsyms_on_each_match_symbol(count_symbols, func_name, &count); ++ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + +- return count; ++ return ctx.count; + } + + static int __trace_kprobe_create(int argc, const char *argv[]) +-- +2.34.1 + diff --git a/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch new file mode 100644 index 0000000000000..c4d67693bd132 --- /dev/null +++ b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch @@ -0,0 +1,117 @@ +From c3dcadfdf2bf8f01471066700c098b5185240df6 Mon Sep 17 00:00:00 2001 +From: Viktor Malik +Date: Tue, 6 Feb 2024 13:46:10 +0100 +Subject: [PATCH 2/2] tools/resolve_btfids: Fix cross-compilation to non-host + endianness + +The .BTF_ids section is pre-filled with zeroed BTF ID entries during the +build and afterwards patched by resolve_btfids with correct values. +Since resolve_btfids always writes in host-native endianness, it relies +on libelf to do the translation when the target ELF is cross-compiled to +a different endianness (this was introduced in commit 61e8aeda9398 +("bpf: Fix libelf endian handling in resolv_btfids")). + +Unfortunately, the translation will corrupt the flags fields of SET8 +entries because these were written during vmlinux compilation and are in +the correct endianness already. This will lead to numerous selftests +failures such as: + + $ sudo ./test_verifier 502 502 + #502/p sleepable fentry accept FAIL + Failed to load prog 'Invalid argument'! + bpf_fentry_test1 is not sleepable + verification time 34 usec + stack depth 0 + processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 + Summary: 0 PASSED, 0 SKIPPED, 1 FAILED + +Since it's not possible to instruct libelf to translate just certain +values, let's manually bswap the flags (both global and entry flags) in +resolve_btfids when needed, so that libelf then translates everything +correctly. + +Fixes: ef2c6f370a63 ("tools/resolve_btfids: Add support for 8-byte BTF sets") +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/7b6bff690919555574ce0f13d2a5996cacf7bf69.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 32634f00abba..d9520cb826b3 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -90,6 +90,14 @@ + + #define ADDR_CNT 100 + ++#if __BYTE_ORDER == __LITTLE_ENDIAN ++# define ELFDATANATIVE ELFDATA2LSB ++#elif __BYTE_ORDER == __BIG_ENDIAN ++# define ELFDATANATIVE ELFDATA2MSB ++#else ++# error "Unknown machine endianness!" ++#endif ++ + struct btf_id { + struct rb_node rb_node; + char *name; +@@ -117,6 +125,7 @@ struct object { + int idlist_shndx; + size_t strtabidx; + unsigned long idlist_addr; ++ int encoding; + } efile; + + struct rb_root sets; +@@ -320,6 +329,7 @@ static int elf_collect(struct object *obj) + { + Elf_Scn *scn = NULL; + size_t shdrstrndx; ++ GElf_Ehdr ehdr; + int idx = 0; + Elf *elf; + int fd; +@@ -351,6 +361,13 @@ static int elf_collect(struct object *obj) + return -1; + } + ++ if (gelf_getehdr(obj->efile.elf, &ehdr) == NULL) { ++ pr_err("FAILED cannot get ELF header: %s\n", ++ elf_errmsg(-1)); ++ return -1; ++ } ++ obj->efile.encoding = ehdr.e_ident[EI_DATA]; ++ + /* + * Scan all the elf sections and look for save data + * from .BTF_ids section and symbols. +@@ -681,6 +698,24 @@ static int sets_patch(struct object *obj) + */ + BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ ++ /* ++ * When ELF endianness does not match endianness of the ++ * host, libelf will do the translation when updating ++ * the ELF. This, however, corrupts SET8 flags which are ++ * already in the target endianness. So, let's bswap ++ * them to the host endianness and libelf will then ++ * correctly translate everything. ++ */ ++ if (obj->efile.encoding != ELFDATANATIVE) { ++ int i; ++ ++ set8->flags = bswap_32(set8->flags); ++ for (i = 0; i < set8->cnt; i++) { ++ set8->pairs[i].flags = ++ bswap_32(set8->pairs[i].flags); ++ } ++ } + } + + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", +-- +2.39.3 + diff --git a/ci/diffs/0099-s390x_nolockdep.diff b/ci/diffs/0099-s390x_nolockdep.diff new file mode 100644 index 0000000000000..44c2d1a520656 --- /dev/null +++ b/ci/diffs/0099-s390x_nolockdep.diff @@ -0,0 +1,48 @@ +From 470d0c7874ac638ea62cddc3a20ec047fa4ab539 Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Wed, 14 Feb 2024 17:25:35 -0800 +Subject: [PATCH] bpf/selftests: disable lockdep on s390x + +Tests are slow to run on s390x, this should make them faster. + +Signed-off-by: Manu Bretelle +--- + tools/testing/selftests/bpf/config.s390x | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x +index 706931a8c2c69..67bfd62b0b582 100644 +--- a/tools/testing/selftests/bpf/config.s390x ++++ b/tools/testing/selftests/bpf/config.s390x +@@ -23,11 +23,11 @@ CONFIG_CPUSETS=y + CONFIG_CRASH_DUMP=y + CONFIG_CRYPTO_USER_API_RNG=y + CONFIG_CRYPTO_USER_API_SKCIPHER=y +-CONFIG_DEBUG_ATOMIC_SLEEP=y ++CONFIG_DEBUG_ATOMIC_SLEEP=n + CONFIG_DEBUG_INFO_BTF=y + CONFIG_DEBUG_INFO_DWARF4=y + CONFIG_DEBUG_LIST=y +-CONFIG_DEBUG_LOCKDEP=y ++CONFIG_DEBUG_LOCKDEP=n + CONFIG_DEBUG_NOTIFIERS=y + CONFIG_DEBUG_PAGEALLOC=y + CONFIG_DEBUG_SECTION_MISMATCH=y +@@ -71,7 +71,7 @@ CONFIG_KRETPROBES=y + CONFIG_KSM=y + CONFIG_LATENCYTOP=y + CONFIG_LIVEPATCH=y +-CONFIG_LOCK_STAT=y ++CONFIG_LOCK_STAT=n + CONFIG_MACVLAN=y + CONFIG_MACVTAP=y + CONFIG_MAGIC_SYSRQ=y +@@ -101,7 +101,7 @@ CONFIG_PCI=y + CONFIG_POSIX_MQUEUE=y + CONFIG_PROC_KCORE=y + CONFIG_PROFILING=y +-CONFIG_PROVE_LOCKING=y ++CONFIG_PROVE_LOCKING=n + CONFIG_PTDUMP_DEBUGFS=y + CONFIG_RC_DEVICES=y + CONFIG_RC_LOOPBACK=y diff --git a/ci/diffs/0099-selftest-cross-compile.diff b/ci/diffs/0099-selftest-cross-compile.diff new file mode 100644 index 0000000000000..e8732596bdb3f --- /dev/null +++ b/ci/diffs/0099-selftest-cross-compile.diff @@ -0,0 +1,13 @@ +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index a38a3001527c..af68528cc944 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -304,7 +304,7 @@ $(OUTPUT)/test_maps: $(TESTING_HELPERS) + $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) + $(OUTPUT)/xsk.o: $(BPFOBJ) + +-BPFTOOL ?= $(DEFAULT_BPFTOOL) ++BPFTOOL ?= $(TRUNNER_BPFTOOL) + $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ diff --git a/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch new file mode 100644 index 0000000000000..b81d22a35322c --- /dev/null +++ b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch @@ -0,0 +1,46 @@ +From 0d24852bd71ec85ca0016b6d6fc997e6a3381552 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Mon, 30 Sep 2024 11:55:00 -0700 +Subject: [PATCH] iov_iter: fix advancing slot in iter_folioq_get_pages() + +iter_folioq_get_pages() decides to advance to the next folioq slot when +it has reached the end of the current folio. However, it is checking +offset, which is the beginning of the current part, instead of +iov_offset, which is adjusted to the end of the current part, so it +doesn't advance the slot when it's supposed to. As a result, on the next +iteration, we'll use the same folio with an out-of-bounds offset and +return an unrelated page. + +This manifested as various crashes and other failures in 9pfs in drgn's +VM testing setup and BPF CI. + +Fixes: db0aa2e9566f ("mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios") +Link: https://lore.kernel.org/linux-fsdevel/20240923183432.1876750-1-chantr4@gmail.com/ +Tested-by: Manu Bretelle +Signed-off-by: Omar Sandoval +Link: https://lore.kernel.org/r/cbaf141ba6c0e2e209717d02746584072844841a.1727722269.git.osandov@fb.com +Tested-by: Eduard Zingerman +Tested-by: Leon Romanovsky +Tested-by: Joey Gouly +Acked-by: David Howells +Signed-off-by: Christian Brauner +--- + lib/iov_iter.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/iov_iter.c b/lib/iov_iter.c +index 97003155b..1abb32c0d 100644 +--- a/lib/iov_iter.c ++++ b/lib/iov_iter.c +@@ -1033,7 +1033,7 @@ static ssize_t iter_folioq_get_pages(struct iov_iter *iter, + if (maxpages == 0 || extracted >= maxsize) + break; + +- if (offset >= fsize) { ++ if (iov_offset >= fsize) { + iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { +-- +2.34.1 + diff --git a/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch new file mode 100644 index 0000000000000..11aa3626f5a81 --- /dev/null +++ b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch @@ -0,0 +1,58 @@ +From affb32e4f056883f285f8535b766293b85752fb4 Mon Sep 17 00:00:00 2001 +From: Jiri Olsa +Date: Tue, 24 Sep 2024 13:07:30 +0200 +Subject: [PATCH] selftests/bpf: Fix uprobe consumer test + +With newly merged code the uprobe behaviour is slightly different +and affects uprobe consumer test. + +We no longer need to check if the uprobe object is still preserved +after removing last uretprobe, because it stays as long as there's +pending/installed uretprobe instance. + +This allows to run uretprobe consumers registered 'after' uprobe was +hit even if previous uretprobe got unregistered before being hit. + +The uprobe object will be now removed after the last uprobe ref is +released and in such case it's held by ri->uprobe (return instance) +which is released after the uretprobe is hit. + +Reported-by: Ihor Solodrai +Signed-off-by: Jiri Olsa +Signed-off-by: Daniel Borkmann +Tested-by: Ihor Solodrai +Closes: https://lore.kernel.org/bpf/w6U8Z9fdhjnkSp2UaFaV1fGqJXvfLEtDKEUyGDkwmoruDJ_AgF_c0FFhrkeKW18OqiP-05s9yDKiT6X-Ns-avN_ABf0dcUkXqbSJN1TQSXo=@pm.me/ +--- + .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +index 844f6fc8487b..c1ac813ff9ba 100644 +--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c ++++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +@@ -869,21 +869,14 @@ static void consumer_test(struct uprobe_multi_consumers *skel, + fmt = "prog 0/1: uprobe"; + } else { + /* +- * uprobe return is tricky ;-) +- * + * to trigger uretprobe consumer, the uretprobe needs to be installed, + * which means one of the 'return' uprobes was alive when probe was hit: + * + * idxs: 2/3 uprobe return in 'installed' mask +- * +- * in addition if 'after' state removes everything that was installed in +- * 'before' state, then uprobe kernel object goes away and return uprobe +- * is not installed and we won't hit it even if it's in 'after' state. + */ + unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */ +- unsigned long probe_preserved = before & after; /* did uprobe go away */ + +- if (had_uretprobes && probe_preserved && test_bit(idx, after)) ++ if (had_uretprobes && test_bit(idx, after)) + val++; + fmt = "idx 2/3: uretprobe"; + } +-- +2.34.1 + diff --git a/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch b/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch new file mode 100644 index 0000000000000..ba37429396236 --- /dev/null +++ b/ci/diffs/0399-selftests-sched_ext-fix-build-after-renames-in-sched.patch @@ -0,0 +1,231 @@ +From 5565144e82b97c5d2082ab19866836dfe5b2e592 Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Thu, 21 Nov 2024 13:20:46 -0800 +Subject: [PATCH] selftests/sched_ext: fix build after renames in sched_ext API + +The selftests are falining to build on current tip of bpf-next and +sched_ext [1]. This has broken BPF CI [2] after merge from upstream. + +Use appropriate function names in the selftests according to the +recent changes in the sched_ext API [3]. + +[1] +https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=fc39fb56917bb3cb53e99560ca3612a84456ada2 +[2] https://github.com/kernel-patches/bpf/actions/runs/11959327258/job/33340923745 +[3] https://lore.kernel.org/all/20241109194853.580310-1-tj@kernel.org/ + +Signed-off-by: Ihor Solodrai +--- + .../testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c | 2 +- + .../selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- + .../selftests/sched_ext/enq_select_cpu_fails.bpf.c | 2 +- + tools/testing/selftests/sched_ext/exit.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/maximal.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c | 2 +- + .../testing/selftests/sched_ext/select_cpu_dispatch.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c | 2 +- + .../selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c | 4 ++-- + tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 8 ++++---- + 12 files changed, 19 insertions(+), 19 deletions(-) + +diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +index 37d9bf6fb745..6f4c3f5a1c5d 100644 +--- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c ++++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c +@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, + * If we dispatch to a bogus DSQ that will fall back to the + * builtin global DSQ, we fail gracefully. + */ +- scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, ++ scx_bpf_dsq_insert_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } +diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +index dffc97d9cdf1..e4a55027778f 100644 +--- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c ++++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c +@@ -17,8 +17,8 @@ s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ +- scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, +- p->scx.dsq_vtime, 0); ++ scx_bpf_dsq_insert_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, ++ p->scx.dsq_vtime, 0); + return cpu; + } + +diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +index 6a7db1502c29..6325bf76f47e 100644 +--- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c ++++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +@@ -45,7 +45,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) + + target = bpf_get_prandom_u32() % nr_cpus; + +- scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } + +diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +index 1efb50d61040..a7cf868d5e31 100644 +--- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c ++++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +@@ -31,7 +31,7 @@ void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + SEC(".struct_ops.link") +diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c +index d75d4faf07f6..4bc36182d3ff 100644 +--- a/tools/testing/selftests/sched_ext/exit.bpf.c ++++ b/tools/testing/selftests/sched_ext/exit.bpf.c +@@ -33,7 +33,7 @@ void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) + if (exit_point == EXIT_ENQUEUE) + EXIT_CLEANLY(); + +- scx_bpf_dispatch(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) +@@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) + if (exit_point == EXIT_DISPATCH) + EXIT_CLEANLY(); + +- scx_bpf_consume(DSQ_ID); ++ scx_bpf_dsq_move_to_local(DSQ_ID); + } + + void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) +diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c +index 4d4cd8d966db..4c005fa71810 100644 +--- a/tools/testing/selftests/sched_ext/maximal.bpf.c ++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c +@@ -20,7 +20,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + + void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) + { +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +@@ -28,7 +28,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) + + void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) + { +- scx_bpf_consume(SCX_DSQ_GLOBAL); ++ scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); + } + + void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +index f171ac470970..13d0f5be788d 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c +@@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + } + scx_bpf_put_idle_cpumask(idle_mask); + +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + + SEC(".struct_ops.link") +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +index 9efdbb7da928..815f1d5d61ac 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c +@@ -67,7 +67,7 @@ void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + saw_local = true; + } + +- scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); + } + + s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +index 59bfc4f36167..4bb99699e920 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c +@@ -29,7 +29,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + cpu = prev_cpu; + + dispatch: +- scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; + } + +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +index 3bbd5fcdfb18..2a75de11b2cf 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c +@@ -18,7 +18,7 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p + s32 prev_cpu, u64 wake_flags) + { + /* Dispatching to a random DSQ should fail. */ +- scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; + } +diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +index 0fda57fe0ecf..99d075695c97 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c +@@ -18,8 +18,8 @@ s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p + s32 prev_cpu, u64 wake_flags) + { + /* Dispatching twice in a row is disallowed. */ +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); +- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); ++ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; + } +diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +index e6c67bcf5e6e..bfcb96cd4954 100644 +--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c ++++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +@@ -2,8 +2,8 @@ + /* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from +- * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and +- * making the test a very basic vtime scheduler. ++ * ops.select_cpu(). We validate this by using scx_bpf_dsq_insert_vtime(), ++ * and making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet +@@ -47,13 +47,13 @@ s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); + ddsp: +- scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); ++ scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; + } + + void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) + { +- if (scx_bpf_consume(VTIME_DSQ)) ++ if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) + consumed = true; + } + +-- +2.47.0 + diff --git a/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch new file mode 100644 index 0000000000000..1f95c3c237cb9 --- /dev/null +++ b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch @@ -0,0 +1,61 @@ +From 80a5958a52b86e39c1a1bf5f4702011c0cf6ab4f Mon Sep 17 00:00:00 2001 +From: Eduard Zingerman +Date: Mon, 2 Dec 2024 12:14:46 -0800 +Subject: [PATCH] samples/bpf: fix samples compilation + +Commit [0] breaks samples build. + +TODO: moar details here + +[0] 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") + +Signed-off-by: Eduard Zingerman +Signed-off-by: Ihor Solodrai +--- + samples/bpf/Makefile | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile +index bcf103a4c14f..ee10dbf1b471 100644 +--- a/samples/bpf/Makefile ++++ b/samples/bpf/Makefile +@@ -146,13 +146,14 @@ ifeq ($(ARCH), x86) + BPF_EXTRA_CFLAGS += -fcf-protection + endif + +-TPROGS_CFLAGS += -Wall -O2 +-TPROGS_CFLAGS += -Wmissing-prototypes +-TPROGS_CFLAGS += -Wstrict-prototypes +-TPROGS_CFLAGS += $(call try-run,\ ++COMMON_CFLAGS += -Wall -O2 ++COMMON_CFLAGS += -Wmissing-prototypes ++COMMON_CFLAGS += -Wstrict-prototypes ++COMMON_CFLAGS += $(call try-run,\ + printf "int main() { return 0; }" |\ + $(CC) -Werror -fsanitize=bounds -x c - -o "$$TMP",-fsanitize=bounds,) + ++TPROGS_CFLAGS += $(COMMON_CFLAGS) + TPROGS_CFLAGS += -I$(objtree)/usr/include + TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ + TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) +@@ -229,7 +230,7 @@ clean: + + $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) + # Fix up variables inherited from Kbuild that tools/ build system won't like +- $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ ++ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ + LDFLAGS="$(TPROGS_LDFLAGS)" srctree=$(BPF_SAMPLES_PATH)/../../ \ + O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ + $@ install_headers +@@ -305,7 +306,7 @@ $(obj)/$(TRACE_HELPERS): TPROGS_CFLAGS := $(TPROGS_CFLAGS) -D__must_check= + -include $(BPF_SAMPLES_PATH)/Makefile.target + + VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ +- $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ ++ $(abspath $(if $(objtree),$(objtree)/vmlinux)) \ + $(abspath ./vmlinux) + VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +-- +2.47.0 + diff --git a/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch new file mode 100644 index 0000000000000..7e38b1a151a10 --- /dev/null +++ b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch @@ -0,0 +1,46 @@ +From faf291ff4beaef8dedebd166f11f815cdee257dc Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Wed, 29 Jan 2025 00:29:37 +0900 +Subject: [PATCH 2000/2001] s390: fgraph: Fix to remove + ftrace_test_recursion_trylock() + +Fix to remove ftrace_test_recursion_trylock() from ftrace_graph_func() +because commit d576aec24df9 ("fgraph: Get ftrace recursion lock in +function_graph_enter") has been moved it to function_graph_enter_regs() +already. + +Reported-by: Jiri Olsa +Closes: https://lore.kernel.org/all/Z5O0shrdgeExZ2kF@krava/ +Fixes: d576aec24df9 ("fgraph: Get ftrace recursion lock in function_graph_enter") +Signed-off-by: Masami Hiramatsu (Google) +Tested-by: Jiri Olsa +--- + arch/s390/kernel/ftrace.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c +index c0b2c97efefb..63ba6306632e 100644 +--- a/arch/s390/kernel/ftrace.c ++++ b/arch/s390/kernel/ftrace.c +@@ -266,18 +266,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) + { + unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; +- int bit; + + if (unlikely(ftrace_graph_is_dead())) + return; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; +- bit = ftrace_test_recursion_trylock(ip, *parent); +- if (bit < 0) +- return; + if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + *parent = (unsigned long)&return_to_handler; +- ftrace_test_recursion_unlock(bit); + } + + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +-- +2.48.1 + diff --git a/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch new file mode 100644 index 0000000000000..66483206f448f --- /dev/null +++ b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch @@ -0,0 +1,28 @@ +From 04fce0d606f59a62105729094013c4784492ec7b Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Wed, 29 Jan 2025 00:29:48 +0900 +Subject: [PATCH 2001/2001] s390: tracing: Define ftrace_get_symaddr() for s390 + +Add ftrace_get_symaddr() for s390, which returns the symbol address +from ftrace's 'ip' parameter. + +Signed-off-by: Masami Hiramatsu (Google) +--- + arch/s390/include/asm/ftrace.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h +index a3b73a4f626e..185331e91f83 100644 +--- a/arch/s390/include/asm/ftrace.h ++++ b/arch/s390/include/asm/ftrace.h +@@ -51,6 +51,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) + { + return addr; + } ++#define ftrace_get_symaddr(fentry_ip) ((unsigned long)(fentry_ip)) + + #include + +-- +2.48.1 + diff --git a/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch new file mode 100644 index 0000000000000..9b24de70e2846 --- /dev/null +++ b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch @@ -0,0 +1,75 @@ +From f44275e7155dc310d36516fc25be503da099781c Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Mon, 6 Jan 2025 20:17:31 +0000 +Subject: [PATCH] selftests/bpf: add -fno-strict-aliasing to BPF_CFLAGS + +Following the discussion at [1], set -fno-strict-aliasing flag for all +BPF object build rules. Remove now unnecessary -CFLAGS variables. + +[1] https://lore.kernel.org/bpf/20250106185447.951609-1-ihor.solodrai@pm.me/ + +CC: Jose E. Marchesi +Signed-off-by: Ihor Solodrai +Acked-by: Eduard Zingerman +Link: https://lore.kernel.org/r/20250106201728.1219791-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 28 +--------------------------- + 1 file changed, 1 insertion(+), 27 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index eb4d21651aa7..d5be2f94deef 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -54,21 +54,6 @@ PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) + LDLIBS += $(PCAP_LIBS) + CFLAGS += $(PCAP_CFLAGS) + +-# The following tests perform type punning and they may break strict +-# aliasing rules, which are exploited by both GCC and clang by default +-# while optimizing. This can lead to broken programs. +-progs/bind4_prog.c-CFLAGS := -fno-strict-aliasing +-progs/bind6_prog.c-CFLAGS := -fno-strict-aliasing +-progs/dynptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/linked_list_fail.c-CFLAGS := -fno-strict-aliasing +-progs/map_kptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/syscall.c-CFLAGS := -fno-strict-aliasing +-progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing +-progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing +-progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +-progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +-progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +- + # Some utility functions use LLVM libraries + jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + +@@ -103,18 +88,6 @@ progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +-# The following tests do type-punning, via the __imm_insn macro, from +-# `struct bpf_insn' to long and then uses the value. This triggers an +-# "is used uninitialized" warning in GCC due to strict-aliasing +-# rules. +-progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing + endif + + ifneq ($(CLANG_CPUV4),) +@@ -474,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare + +-- +2.47.1 + diff --git a/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch new file mode 100644 index 0000000000000..127b2641dc223 --- /dev/null +++ b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch @@ -0,0 +1,63 @@ +From bab18c7db44d3aa6c84450095451580922359c7a Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Tue, 7 Jan 2025 23:58:18 +0000 +Subject: [PATCH] selftests/bpf: add -std=gnu11 to BPF_CFLAGS and CFLAGS + +Latest versions of GCC BPF use C23 standard by default. This causes +compilation errors in vmlinux.h due to bool types declarations. + +Add -std=gnu11 to BPF_CFLAGS and CFLAGS. This aligns with the version +of the standard used when building the kernel currently [1]. + +For more details see the discussions at [2] and [3]. + +[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Makefile#n465 +[2] https://lore.kernel.org/bpf/EYcXjcKDCJY7Yb0GGtAAb7nLKPEvrgWdvWpuNzXm2qi6rYMZDixKv5KwfVVMBq17V55xyC-A1wIjrqG3aw-Imqudo9q9X7D7nLU2gWgbN0w=@pm.me/ +[3] https://lore.kernel.org/bpf/20250106202715.1232864-1-ihor.solodrai@pm.me/ + +CC: Jose E. Marchesi +Signed-off-by: Ihor Solodrai +Link: https://lore.kernel.org/r/20250107235813.2964472-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index d5be2f94deef..ea9cee5de0f8 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -41,7 +41,7 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + +-CFLAGS += -g $(OPT_FLAGS) -rdynamic \ ++CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ + -Wall -Werror -fno-omit-frame-pointer \ + $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ +@@ -447,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -std=gnu11 \ + -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare +@@ -787,9 +788,12 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ + + # Make sure we are able to include and link libbpf against c++. ++CXXFLAGS += $(CFLAGS) ++CXXFLAGS := $(subst -D_GNU_SOURCE=,,$(CXXFLAGS)) ++CXXFLAGS := $(subst -std=gnu11,-std=gnu++11,$(CXXFLAGS)) + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(CXXFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.47.1 + diff --git a/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch new file mode 100644 index 0000000000000..31bc3fb4717ec --- /dev/null +++ b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch @@ -0,0 +1,100 @@ +From a99b3ef16d4473f8f9b7547f89791d037112a7e6 Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Wed, 12 Feb 2025 22:24:01 +0000 +Subject: [PATCH] netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all + subreqs queued + +Due to the code that queues a subreq on the active subrequest list getting +moved to netfs_issue_read(), the NETFS_RREQ_ALL_QUEUED flag may now get set +before the list-add actually happens. This is not a problem if the +collection worker happens after the list-add, but it's a race - and, for +9P, where the read from the server is synchronous and done in the +submitting thread, this is a lot more likely. + +The result is that, if the timing is wrong, a ref gets leaked because the +collector thinks that all the subreqs have completed (because it can't see +the last one yet) and clears NETFS_RREQ_IN_PROGRESS - at which point, the +collection worker no longer goes into the collector. + +This can be provoked with AFS by injecting an msleep() right before the +final subreq is queued. + +Fix this by splitting the queuing part out of netfs_issue_read() into a new +function, netfs_queue_read(), and calling it separately. The setting of +NETFS_RREQ_ALL_QUEUED is then done by netfs_queue_read() whilst it is +holding the spinlock (that's probably unnecessary, but shouldn't hurt). + +It might be better to set a flag on the final subreq, but this could be a +problem if an error occurs and we can't queue it. + +Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/r/a7x33d4dnMdGTtRivptq6S1i8btK70SNBP2XyX_xwDAhLvgQoPox6FVBOkifq4eBinfFfbZlIkMZBe3QarlWTxoEtHZwJCZbNKtaqrR7PvI=@pm.me/ +Signed-off-by: David Howells +Tested-by: Ihor Solodrai +cc: Eric Van Hensbergen +cc: Latchesar Ionkov +cc: Dominique Martinet +cc: Christian Schoenebeck +cc: Marc Dionne +cc: Steve French +cc: Paulo Alcantara +cc: Jeff Layton +cc: v9fs@lists.linux.dev +cc: linux-cifs@vger.kernel.org +cc: netfs@lists.linux.dev +cc: linux-fsdevel@vger.kernel.org +--- + fs/netfs/buffered_read.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c +index f761d44b3436..0d1b6d35ff3b 100644 +--- a/fs/netfs/buffered_read.c ++++ b/fs/netfs/buffered_read.c +@@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, + netfs_cache_read_terminated, subreq); + } + +-static void netfs_issue_read(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq) ++static void netfs_queue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq, ++ bool last_subreq) + { + struct netfs_io_stream *stream = &rreq->io_streams[0]; + +@@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, + } + } + ++ if (last_subreq) { ++ smp_wmb(); /* Write lists before ALL_QUEUED. */ ++ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); ++ } ++ + spin_unlock(&rreq->lock); ++} + ++static void netfs_issue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + rreq->netfs_ops->issue_read(subreq); +@@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) + } + size -= slice; + start += slice; +- if (size <= 0) { +- smp_wmb(); /* Write lists before ALL_QUEUED. */ +- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); +- } + ++ netfs_queue_read(rreq, subreq, size <= 0); + netfs_issue_read(rreq, subreq); + cond_resched(); + } while (size > 0); +-- +2.48.1 + diff --git a/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch b/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch new file mode 100644 index 0000000000000..0f65cb47ce2e6 --- /dev/null +++ b/ci/diffs/9998-sched_ext-Fix-invalid-irq-restore-in-scx_ops_bypass.patch @@ -0,0 +1,56 @@ +From 10e1d78546b3dd4ea9d773c0b0257064a99211e9 Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Wed, 11 Dec 2024 11:01:51 -1000 +Subject: [PATCH] sched_ext: Fix invalid irq restore in scx_ops_bypass() + +While adding outer irqsave/restore locking, 0e7ffff1b811 ("scx: Fix raciness +in scx_ops_bypass()") forgot to convert an inner rq_unlock_irqrestore() to +rq_unlock() which could re-enable IRQ prematurely leading to the following +warning: + + raw_local_irq_restore() called with IRQs enabled + WARNING: CPU: 1 PID: 96 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x30/0x40 + ... + Sched_ext: create_dsq (enabling) + pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) + pc : warn_bogus_irq_restore+0x30/0x40 + lr : warn_bogus_irq_restore+0x30/0x40 + ... + Call trace: + warn_bogus_irq_restore+0x30/0x40 (P) + warn_bogus_irq_restore+0x30/0x40 (L) + scx_ops_bypass+0x224/0x3b8 + scx_ops_enable.isra.0+0x2c8/0xaa8 + bpf_scx_reg+0x18/0x30 + ... + irq event stamp: 33739 + hardirqs last enabled at (33739): [] scx_ops_bypass+0x174/0x3b8 + hardirqs last disabled at (33738): [] _raw_spin_lock_irqsave+0xb4/0xd8 + +Drop the stray _irqrestore(). + +Signed-off-by: Tejun Heo +Reported-by: Ihor Solodrai +Link: http://lkml.kernel.org/r/qC39k3UsonrBYD_SmuxHnZIQLsuuccoCrkiqb_BT7DvH945A1_LZwE4g-5Pu9FcCtqZt4lY1HhIPi0homRuNWxkgo1rgP3bkxa0donw8kV4=@pm.me +Fixes: 0e7ffff1b811 ("scx: Fix raciness in scx_ops_bypass()") +Cc: stable@vger.kernel.org # v6.12 +--- + kernel/sched/ext.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 7fff1d045477..98519e6d0dcd 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) + * sees scx_rq_bypassing() before moving tasks to SCX. + */ + if (!scx_enabled()) { +- rq_unlock_irqrestore(rq, &rf); ++ rq_unlock(rq, &rf); + continue; + } + +-- +2.47.1 + diff --git a/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch b/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch new file mode 100644 index 0000000000000..9b5e6d50778e1 --- /dev/null +++ b/ci/diffs/9999-scx-Fix-maximal-BPF-selftest-prog.patch @@ -0,0 +1,56 @@ +From 70414cacbe536a197d56f58a42f9563e5a01b8ec Mon Sep 17 00:00:00 2001 +From: David Vernet +Date: Mon, 9 Dec 2024 09:29:24 -0600 +Subject: [PATCH] scx: Fix maximal BPF selftest prog + +maximal.bpf.c is still dispatching to and consuming from SCX_DSQ_GLOBAL. +Let's have it use its own DSQ to avoid any runtime errors. + +Signed-off-by: David Vernet +--- + tools/testing/selftests/sched_ext/maximal.bpf.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c +index 4c005fa71810..430f5e13bf55 100644 +--- a/tools/testing/selftests/sched_ext/maximal.bpf.c ++++ b/tools/testing/selftests/sched_ext/maximal.bpf.c +@@ -12,6 +12,8 @@ + + char _license[] SEC("license") = "GPL"; + ++#define DSQ_ID 0 ++ + s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) + { +@@ -20,7 +22,7 @@ s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, + + void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) + { +- scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); ++ scx_bpf_dsq_insert(p, DSQ_ID, SCX_SLICE_DFL, enq_flags); + } + + void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) +@@ -28,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) + + void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) + { +- scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL); ++ scx_bpf_dsq_move_to_local(DSQ_ID); + } + + void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) +@@ -123,7 +125,7 @@ void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) + + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) + { +- return 0; ++ return scx_bpf_create_dsq(DSQ_ID, -1); + } + + void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) +-- +2.47.1 + diff --git a/ci/vmtest/configs/DENYLIST b/ci/vmtest/configs/DENYLIST new file mode 100644 index 0000000000000..2f9bf5c0aa016 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST @@ -0,0 +1,15 @@ +# TEMPORARY +btf_dump/btf_dump: syntax +kprobe_multi_bench_attach +core_reloc/enum64val +core_reloc/size___diff_sz +core_reloc/type_based___diff_sz +test_ima # All of CI is broken on it following 6.3-rc1 merge + +lwt_reroute # crashes kernel after netnext merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +tc_links_ingress # started failing after net-next merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +xdp_bonding/xdp_bonding_features # started failing after net merge from 359e54a93ab4 "l2tp: pass correct message length to ip6_append_data" +tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets") +migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +connect_force_port # unreliably fails diff --git a/ci/vmtest/configs/DENYLIST.aarch64 b/ci/vmtest/configs/DENYLIST.aarch64 new file mode 100644 index 0000000000000..487b19ede4b61 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.aarch64 @@ -0,0 +1,4 @@ +cgrp_local_storage # libbpf: prog 'update_cookie_tracing': failed to attach: ERROR: strerror_r(-524)=22 +core_reloc_btfgen # run_core_reloc_tests:FAIL:run_btfgen unexpected error: 32512 (errno 22) +usdt/multispec # usdt_300_bad_attach unexpected pointer: 0x558c63d8f0 +xdp_bonding # whole test suite is very unstable on aarch64 diff --git a/ci/vmtest/configs/DENYLIST.rc b/ci/vmtest/configs/DENYLIST.rc new file mode 100644 index 0000000000000..8aa33e6b71443 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.rc @@ -0,0 +1,3 @@ +send_signal/send_signal_nmi # PMU events configure correctly but don't trigger NMI's for some reason (AMD nested virt) +send_signal/send_signal_nmi_thread # Same as above +token/obj_priv_implicit_token_envvar # Unknown root cause, but reliably fails diff --git a/ci/vmtest/configs/DENYLIST.s390x b/ci/vmtest/configs/DENYLIST.s390x new file mode 100644 index 0000000000000..9b90b615aea55 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.s390x @@ -0,0 +1,11 @@ +deny_namespace # not yet in bpf denylist +tc_redirect/tc_redirect_dtime # very flaky +lru_bug # not yet in bpf-next denylist +# Disabled temporarily for a crash. +# https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ +dummy_st_ops/dummy_init_ptr_arg +fexit_bpf2bpf +tailcalls +trace_ext +xdp_bpf2bpf +xdp_metadata diff --git a/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc new file mode 100644 index 0000000000000..a3c745d1f5b52 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc @@ -0,0 +1,904 @@ +arena_htab +async_stack_depth +bad_struct_ops/invalid_prog_reuse +bpf_cookie +bpf_iter/bpf_hash_map +bpf_iter/ksym +bpf_iter/tcp4 +bpf_iter/tcp6 +bpf_iter/udp4 +bpf_iter/udp6 +bpf_iter/unix +bpf_iter_setsockopt +bpf_iter_setsockopt_unix +bpf_mod_race +bpf_nf/tc-bpf-ct +bpf_nf/xdp-ct +bpf_tcp_ca/cubic +btf_dump/btf_dump: bitfields +btf_dump/btf_dump: packing +btf_dump/btf_dump: padding +btf_dump/btf_dump: syntax +btf_map_in_map +cb_refs +cgroup_get_current_cgroup_id +cgroup_iter/cgroup_iter__self_only_css_task +cgroup_tcp_skb +cgrp_kfunc +cls_redirect/cls_redirect_dynptr +connect_force_port +core_autosize +core_read_macros +core_reloc/type_id +core_reloc/type_id___missing_targets +core_reloc_btfgen/type_id +core_reloc_btfgen/type_id___missing_targets +cpumask/test_acquire_wrong_cpumask +cpumask/test_alloc_double_release +cpumask/test_alloc_free_cpumask +cpumask/test_alloc_no_release +cpumask/test_and_or_xor +cpumask/test_copy_any_anyand +cpumask/test_cpumask_null +cpumask/test_cpumask_weight +cpumask/test_first_firstzero_cpu +cpumask/test_firstand_nocpu +cpumask/test_global_mask_array_l2_rcu +cpumask/test_global_mask_array_one_rcu +cpumask/test_global_mask_array_rcu +cpumask/test_global_mask_nested_deep_array_rcu +cpumask/test_global_mask_nested_deep_rcu +cpumask/test_global_mask_nested_rcu +cpumask/test_global_mask_no_null_check +cpumask/test_global_mask_out_of_rcu +cpumask/test_global_mask_rcu +cpumask/test_global_mask_rcu_no_null_check +cpumask/test_insert_leave +cpumask/test_insert_remove_no_release +cpumask/test_insert_remove_release +cpumask/test_intersects_subset +cpumask/test_invalid_nested_array +cpumask/test_mutate_cpumask +cpumask/test_set_clear_cpu +cpumask/test_setall_clear_cpu +cpumask/test_test_and_set_clear +crypto_basic/crypto_acquire +crypto_sanity +deny_namespace +dummy_st_ops/test_unsupported_field_sleepable +dynptr/add_dynptr_to_map1 +dynptr/add_dynptr_to_map2 +dynptr/clone_invalid1 +dynptr/clone_invalid2 +dynptr/clone_invalidate1 +dynptr/clone_invalidate2 +dynptr/clone_invalidate3 +dynptr/clone_invalidate4 +dynptr/clone_invalidate5 +dynptr/clone_invalidate6 +dynptr/clone_skb_packet_data +dynptr/clone_xdp_packet_data +dynptr/data_slice_missing_null_check1 +dynptr/data_slice_missing_null_check2 +dynptr/data_slice_out_of_bounds_map_value +dynptr/data_slice_out_of_bounds_ringbuf +dynptr/data_slice_out_of_bounds_skb +dynptr/data_slice_use_after_release1 +dynptr/data_slice_use_after_release2 +dynptr/dynptr_adjust_invalid +dynptr/dynptr_from_mem_invalid_api +dynptr/dynptr_invalidate_slice_failure +dynptr/dynptr_invalidate_slice_or_null +dynptr/dynptr_invalidate_slice_reinit +dynptr/dynptr_is_null_invalid +dynptr/dynptr_is_rdonly_invalid +dynptr/dynptr_overwrite_ref +dynptr/dynptr_partial_slot_invalidate +dynptr/dynptr_pruning_overwrite +dynptr/dynptr_pruning_type_confusion +dynptr/dynptr_read_into_slot +dynptr/dynptr_size_invalid +dynptr/dynptr_slice_var_len1 +dynptr/dynptr_slice_var_len2 +dynptr/dynptr_var_off_overwrite +dynptr/global +dynptr/invalid_data_slices +dynptr/invalid_helper1 +dynptr/invalid_helper2 +dynptr/invalid_offset +dynptr/invalid_read1 +dynptr/invalid_read2 +dynptr/invalid_read3 +dynptr/invalid_read4 +dynptr/invalid_slice_rdwr_rdonly +dynptr/invalid_write1 +dynptr/invalid_write2 +dynptr/invalid_write3 +dynptr/invalid_write4 +dynptr/release_twice +dynptr/release_twice_callback +dynptr/ringbuf_invalid_api +dynptr/ringbuf_missing_release1 +dynptr/ringbuf_missing_release2 +dynptr/ringbuf_missing_release_callback +dynptr/ringbuf_release_uninit_dynptr +dynptr/skb_invalid_ctx +dynptr/skb_invalid_ctx_fentry +dynptr/skb_invalid_ctx_fexit +dynptr/skb_invalid_data_slice1 +dynptr/skb_invalid_data_slice2 +dynptr/skb_invalid_data_slice3 +dynptr/skb_invalid_data_slice4 +dynptr/skb_invalid_slice_write +dynptr/test_dynptr_reg_type +dynptr/test_dynptr_skb_no_buff +dynptr/test_dynptr_skb_small_buff +dynptr/test_dynptr_skb_tp_btf +dynptr/test_read_write +dynptr/uninit_write_into_slot +dynptr/use_after_invalid +dynptr/xdp_invalid_ctx +dynptr/xdp_invalid_data_slice1 +dynptr/xdp_invalid_data_slice2 +exceptions/check_assert_eq_int_max +exceptions/check_assert_eq_int_min +exceptions/check_assert_eq_llong_max +exceptions/check_assert_eq_llong_min +exceptions/check_assert_eq_zero +exceptions/check_assert_ge_neg +exceptions/check_assert_ge_pos +exceptions/check_assert_ge_zero +exceptions/check_assert_generic +exceptions/check_assert_gt_neg +exceptions/check_assert_gt_pos +exceptions/check_assert_gt_zero +exceptions/check_assert_le_neg +exceptions/check_assert_le_pos +exceptions/check_assert_le_zero +exceptions/check_assert_lt_neg +exceptions/check_assert_lt_pos +exceptions/check_assert_lt_zero +exceptions/check_assert_range_s64 +exceptions/check_assert_range_u64 +exceptions/check_assert_single_range_s64 +exceptions/check_assert_single_range_u64 +exceptions/check_assert_with_return +exceptions/exception_ext +exceptions/exception_ext_mod_cb_runtime +exceptions/non-throwing extension -> non-throwing subprog +exceptions/non-throwing extension -> throwing global subprog +exceptions/non-throwing fentry -> exception_cb +exceptions/non-throwing fexit -> exception_cb +exceptions/non-throwing fmod_ret -> non-throwing global subprog +exceptions/reject_async_callback_throw +exceptions/reject_exception_throw_cb +exceptions/reject_exception_throw_cb_diff +exceptions/reject_set_exception_cb_bad_ret2 +exceptions/reject_subprog_with_lock +exceptions/reject_subprog_with_rcu_read_lock +exceptions/reject_with_cb +exceptions/reject_with_cb_reference +exceptions/reject_with_lock +exceptions/reject_with_rbtree_add_throw +exceptions/reject_with_rcu_read_lock +exceptions/reject_with_reference +exceptions/reject_with_subprog_reference +exceptions/throwing extension (with custom cb) -> exception_cb +exceptions/throwing extension -> global func in exception_cb +exceptions/throwing extension -> non-throwing global subprog +exceptions/throwing extension -> throwing global subprog +exceptions/throwing fentry -> exception_cb +exceptions/throwing fexit -> exception_cb +failures_wq +fexit_bpf2bpf/fmod_ret_freplace +fexit_bpf2bpf/func_replace +fexit_bpf2bpf/func_replace_global_func +fexit_bpf2bpf/func_replace_multi +fexit_bpf2bpf/func_sockmap_update +fexit_bpf2bpf/target_yes_callees +global_func_dead_code +global_map_resize +inner_array_lookup +irq/irq_flag_overwrite +irq/irq_flag_overwrite_partial +irq/irq_global_subprog +irq/irq_ooo_refs_array +irq/irq_restore_4_subprog +irq/irq_restore_bad_arg +irq/irq_restore_invalid +irq/irq_restore_iter +irq/irq_restore_missing_1_subprog +irq/irq_restore_missing_2 +irq/irq_restore_missing_2_subprog +irq/irq_restore_missing_3 +irq/irq_restore_missing_3_minus_2 +irq/irq_restore_missing_3_minus_2_subprog +irq/irq_restore_missing_3_subprog +irq/irq_restore_ooo +irq/irq_restore_ooo_3 +irq/irq_restore_ooo_3_subprog +irq/irq_save_bad_arg +irq/irq_save_invalid +irq/irq_save_iter +irq/irq_sleepable_helper +irq/irq_sleepable_kfunc +iters/compromise_iter_w_direct_write_and_skip_destroy_fail +iters/compromise_iter_w_direct_write_fail +iters/compromise_iter_w_helper_write_fail +iters/create_and_forget_to_destroy_fail +iters/css_task +iters/delayed_precision_mark +iters/delayed_read_mark +iters/destroy_without_creating_fail +iters/double_create_fail +iters/double_destroy_fail +iters/iter_css_lock_and_unlock +iters/iter_css_task_for_each +iters/iter_css_without_lock +iters/iter_destroy_bad_arg +iters/iter_err_too_permissive1 +iters/iter_err_too_permissive2 +iters/iter_err_too_permissive3 +iters/iter_err_unsafe_asm_loop +iters/iter_err_unsafe_c_loop +iters/iter_nested_iters +iters/iter_new_bad_arg +iters/iter_next_bad_arg +iters/iter_next_ptr_mem_not_trusted +iters/iter_next_rcu_not_trusted +iters/iter_next_rcu_or_null +iters/iter_next_trusted_or_null +iters/iter_obfuscate_counter +iters/iter_subprog_iters +iters/iter_tasks_lock_and_unlock +iters/iter_tasks_without_lock +iters/leak_iter_from_subprog_fail +iters/loop_state_deps1 +iters/loop_state_deps2 +iters/missing_null_check_fail +iters/next_after_destroy_fail +iters/next_without_new_fail +iters/read_from_iter_slot_fail +iters/stacksafe_should_not_conflate_stack_spill_and_iter +iters/testmod_seq_getter_after_bad +iters/testmod_seq_getter_before_bad +iters/wrong_sized_read_fail +jeq_infer_not_null +jit_probe_mem +kfree_skb +kfunc_call/kfunc_call_ctx +kfunc_call/kfunc_call_test1 +kfunc_call/kfunc_call_test2 +kfunc_call/kfunc_call_test4 +kfunc_call/kfunc_call_test_get_mem +kfunc_call/kfunc_call_test_ref_btf_id +kfunc_call/kfunc_call_test_static_unused_arg +kfunc_call/kfunc_syscall_test +kfunc_call/kfunc_syscall_test_null +kfunc_dynptr_param/not_ptr_to_stack +kfunc_dynptr_param/not_valid_dynptr +kfunc_param_nullable/kfunc_dynptr_nullable_test3 +kprobe_multi_test/kprobe_session_return_2 +kptr_xchg_inline +l4lb_all/l4lb_noinline +l4lb_all/l4lb_noinline_dynptr +linked_list +local_kptr_stash/drop_rb_node_off +local_kptr_stash/local_kptr_stash_local_with_root +local_kptr_stash/local_kptr_stash_plain +local_kptr_stash/local_kptr_stash_simple +local_kptr_stash/local_kptr_stash_unstash +local_kptr_stash/refcount_acquire_without_unstash +local_kptr_stash/stash_rb_nodes +log_buf/obj_load_log_buf +log_fixup/bad_core_relo_subprog +log_fixup/bad_core_relo_trunc_full +lru_bug +map_btf +map_in_map/acc_map_in_array +map_in_map/acc_map_in_htab +map_in_map/sleepable_acc_map_in_array +map_in_map/sleepable_acc_map_in_htab +map_kptr/correct_btf_id_check_size +map_kptr/inherit_untrusted_on_walk +map_kptr/kptr_xchg_possibly_null +map_kptr/kptr_xchg_ref_state +map_kptr/mark_ref_as_untrusted_or_null +map_kptr/marked_as_untrusted_or_null +map_kptr/non_const_var_off +map_kptr/non_const_var_off_kptr_xchg +map_kptr/reject_bad_type_xchg +map_kptr/reject_kptr_xchg_on_unref +map_kptr/reject_member_of_ref_xchg +map_kptr/reject_untrusted_xchg +map_kptr/success-map +map_ptr +nested_trust/test_invalid_nested_user_cpus +nested_trust/test_invalid_skb_field +percpu_alloc/array +percpu_alloc/array_sleepable +percpu_alloc/cgrp_local_storage +percpu_alloc/test_array_map_1 +percpu_alloc/test_array_map_2 +percpu_alloc/test_array_map_3 +percpu_alloc/test_array_map_4 +percpu_alloc/test_array_map_5 +percpu_alloc/test_array_map_6 +percpu_alloc/test_array_map_7 +percpu_alloc/test_array_map_8 +perf_branches/perf_branches_no_hw +pkt_access +preempt_lock/preempt_global_subprog_test +preempt_lock/preempt_lock_missing_1 +preempt_lock/preempt_lock_missing_1_subprog +preempt_lock/preempt_lock_missing_2 +preempt_lock/preempt_lock_missing_2_minus_1_subprog +preempt_lock/preempt_lock_missing_2_subprog +preempt_lock/preempt_lock_missing_3 +preempt_lock/preempt_lock_missing_3_minus_2 +preempt_lock/preempt_sleepable_helper +preempt_lock/preempt_sleepable_kfunc +preempted_bpf_ma_op +prog_run_opts +prog_tests_framework +raw_tp_null +rbtree_fail +rbtree_success +recursion +refcounted_kptr +refcounted_kptr_fail +refcounted_kptr_wrong_owner +reference_tracking/sk_lookup_success +ringbuf_multi +setget_sockopt +sk_lookup +skc_to_unix_sock +sock_addr/recvmsg4: attach prog with wrong attach type +sock_addr/recvmsg4: recvfrom (dgram) +sock_addr/recvmsg6: attach prog with wrong attach type +sock_addr/recvmsg6: recvfrom (dgram) +sock_addr/sendmsg4: attach prog with wrong attach type +sock_addr/sendmsg4: kernel_sendmsg (dgram) +sock_addr/sendmsg4: kernel_sendmsg deny (dgram) +sock_addr/sendmsg4: sendmsg (dgram) +sock_addr/sendmsg4: sendmsg deny (dgram) +sock_addr/sendmsg4: sock_sendmsg (dgram) +sock_addr/sendmsg4: sock_sendmsg deny (dgram) +sock_addr/sendmsg6: attach prog with wrong attach type +sock_addr/sendmsg6: kernel_sendmsg (dgram) +sock_addr/sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: kernel_sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg (dgram) +sock_addr/sendmsg6: sendmsg IPv4-mapped IPv6 (dgram) +sock_addr/sendmsg6: sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg (dgram) +sock_addr/sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg deny (dgram) +sock_destroy/trace_tcp_destroy_sock +sock_fields +sockmap_listen/sockhash IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 UDP test_reuseport_select_connected +spin_lock +struct_ops_module/unsupported_ops +syscall +tailcalls/classifier_0 +tailcalls/classifier_1 +tailcalls/reject_tail_call_preempt_lock +tailcalls/reject_tail_call_rcu_lock +tailcalls/reject_tail_call_ref +tailcalls/reject_tail_call_spin_lock +tailcalls/tailcall_6 +tailcalls/tailcall_bpf2bpf_2 +tailcalls/tailcall_bpf2bpf_3 +tailcalls/tailcall_bpf2bpf_fentry +tailcalls/tailcall_bpf2bpf_fentry_entry +tailcalls/tailcall_bpf2bpf_fentry_fexit +tailcalls/tailcall_bpf2bpf_fexit +tailcalls/tailcall_bpf2bpf_hierarchy_2 +tailcalls/tailcall_bpf2bpf_hierarchy_3 +task_kfunc +task_local_storage/uptr_across_pages +task_local_storage/uptr_basic +task_local_storage/uptr_kptr_xchg +task_local_storage/uptr_map_failure_e2big +task_local_storage/uptr_map_failure_kstruct +task_local_storage/uptr_map_failure_size0 +task_local_storage/uptr_no_null_check +task_local_storage/uptr_obj_new +task_local_storage/uptr_update_failure +tc_bpf/tc_bpf_non_root +tc_redirect/tc_redirect_dtime +tcp_custom_syncookie +tcp_hdr_options +test_bpf_ma +test_global_funcs/arg_tag_ctx_kprobe +test_global_funcs/arg_tag_ctx_perf +test_global_funcs/arg_tag_ctx_raw_tp +test_global_funcs/global_func1 +test_global_funcs/global_func10 +test_global_funcs/global_func11 +test_global_funcs/global_func12 +test_global_funcs/global_func13 +test_global_funcs/global_func14 +test_global_funcs/global_func15 +test_global_funcs/global_func15_tricky_pruning +test_global_funcs/global_func17 +test_global_funcs/global_func3 +test_global_funcs/global_func5 +test_global_funcs/global_func6 +test_global_funcs/global_func7 +test_lsm/lsm_basic +test_profiler +test_strncmp/strncmp_bad_not_null_term_target +timer +timer_mim +token +tp_btf_nullable/handle_tp_btf_nullable_bare1 +tunnel +uprobe_multi_test/uprobe_sesison_return_2 +user_ringbuf/user_ringbuf_callback_bad_access1 +user_ringbuf/user_ringbuf_callback_bad_access2 +user_ringbuf/user_ringbuf_callback_const_ptr_to_dynptr_reg_off +user_ringbuf/user_ringbuf_callback_discard_dynptr +user_ringbuf/user_ringbuf_callback_invalid_return +user_ringbuf/user_ringbuf_callback_null_context_read +user_ringbuf/user_ringbuf_callback_null_context_write +user_ringbuf/user_ringbuf_callback_reinit_dynptr_mem +user_ringbuf/user_ringbuf_callback_reinit_dynptr_ringbuf +user_ringbuf/user_ringbuf_callback_submit_dynptr +user_ringbuf/user_ringbuf_callback_write_forbidden +verif_scale_pyperf100 +verif_scale_pyperf180 +verif_scale_pyperf600 +verif_scale_pyperf600_nounroll +verif_scale_seg6_loop +verif_scale_strobemeta +verif_scale_strobemeta_nounroll1 +verif_scale_strobemeta_nounroll2 +verif_scale_strobemeta_subprogs +verif_scale_sysctl_loop1 +verif_scale_sysctl_loop2 +verif_scale_xdp_loop +verifier_and/invalid_and_of_negative_number +verifier_and/invalid_range_check +verifier_arena/iter_maps2 +verifier_arena/iter_maps3 +verifier_array_access/a_read_only_array_1_2 +verifier_array_access/a_read_only_array_2_2 +verifier_array_access/a_write_only_array_1_2 +verifier_array_access/a_write_only_array_2_2 +verifier_array_access/an_array_with_a_constant_2 +verifier_array_access/an_array_with_a_register_2 +verifier_array_access/an_array_with_a_variable_2 +verifier_array_access/array_with_no_floor_check +verifier_array_access/with_a_invalid_max_check_1 +verifier_array_access/with_a_invalid_max_check_2 +verifier_basic_stack/invalid_fp_arithmetic +verifier_basic_stack/misaligned_read_from_stack +verifier_basic_stack/stack_out_of_bounds +verifier_bitfield_write +verifier_bits_iter/destroy_uninit +verifier_bits_iter/next_uninit +verifier_bits_iter/no_destroy +verifier_bounds/bounds_map_value_variant_1 +verifier_bounds/bounds_map_value_variant_2 +verifier_bounds/of_boundary_crossing_range_1 +verifier_bounds/of_boundary_crossing_range_2 +verifier_bounds/on_sign_extended_mov_test1 +verifier_bounds/on_sign_extended_mov_test2 +verifier_bounds/reg32_any_reg32_xor_3 +verifier_bounds/reg_any_reg_xor_3 +verifier_bounds/shift_of_maybe_negative_number +verifier_bounds/shift_with_64_bit_input +verifier_bounds/shift_with_oversized_count_operand +verifier_bounds/size_signed_32bit_overflow_test1 +verifier_bounds/size_signed_32bit_overflow_test2 +verifier_bounds/size_signed_32bit_overflow_test3 +verifier_bounds/size_signed_32bit_overflow_test4 +verifier_bounds/var_off_insn_off_test1 +verifier_bounds/var_off_insn_off_test2 +verifier_bounds_deduction/deducing_bounds_from_const_1 +verifier_bounds_deduction/deducing_bounds_from_const_10 +verifier_bounds_deduction/deducing_bounds_from_const_3 +verifier_bounds_deduction/deducing_bounds_from_const_5 +verifier_bounds_deduction/deducing_bounds_from_const_6 +verifier_bounds_deduction/deducing_bounds_from_const_7 +verifier_bounds_deduction/deducing_bounds_from_const_8 +verifier_bounds_deduction/deducing_bounds_from_const_9 +verifier_bounds_mix_sign_unsign/checks_mixing_signed_and_unsigned +verifier_bounds_mix_sign_unsign/signed_and_unsigned_positive_bounds +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_10 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_11 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_12 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_13 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_14 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_15 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_2 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_3 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_5 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_6 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_8 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_16 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_32 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_8 +verifier_cfg/conditional_loop +verifier_cfg/loop2_back_edge +verifier_cfg/loop_back_edge +verifier_cfg/out_of_range_jump +verifier_cfg/out_of_range_jump2 +verifier_cfg/uncond_loop_after_cond_jmp +verifier_cfg/uncond_loop_in_subprog_after_cond_jmp +verifier_cfg/unreachable +verifier_cfg/unreachable2 +verifier_cgroup_inv_retcode/with_invalid_return_code_test1 +verifier_cgroup_inv_retcode/with_invalid_return_code_test3 +verifier_cgroup_inv_retcode/with_invalid_return_code_test5 +verifier_cgroup_inv_retcode/with_invalid_return_code_test6 +verifier_cgroup_inv_retcode/with_invalid_return_code_test7 +verifier_cgroup_skb/data_meta_for_cgroup_skb +verifier_cgroup_skb/flow_keys_for_cgroup_skb +verifier_cgroup_skb/napi_id_for_cgroup_skb +verifier_cgroup_skb/tc_classid_for_cgroup_skb +verifier_cgroup_storage/cpu_cgroup_storage_access_1 +verifier_cgroup_storage/cpu_cgroup_storage_access_2 +verifier_cgroup_storage/cpu_cgroup_storage_access_3 +verifier_cgroup_storage/cpu_cgroup_storage_access_4 +verifier_cgroup_storage/cpu_cgroup_storage_access_5 +verifier_cgroup_storage/cpu_cgroup_storage_access_6 +verifier_cgroup_storage/invalid_cgroup_storage_access_1 +verifier_cgroup_storage/invalid_cgroup_storage_access_2 +verifier_cgroup_storage/invalid_cgroup_storage_access_3 +verifier_cgroup_storage/invalid_cgroup_storage_access_4 +verifier_cgroup_storage/invalid_cgroup_storage_access_5 +verifier_cgroup_storage/invalid_cgroup_storage_access_6 +verifier_const/bprm +verifier_const/tcx1 +verifier_const/tcx4 +verifier_const/tcx7 +verifier_const_or/not_bypass_stack_boundary_checks_1 +verifier_const_or/not_bypass_stack_boundary_checks_2 +verifier_ctx/context_stores_via_bpf_atomic +verifier_ctx/ctx_pointer_to_helper_1 +verifier_ctx/ctx_pointer_to_helper_2 +verifier_ctx/ctx_pointer_to_helper_3 +verifier_ctx/make_ptr_to_ctx_unusable +verifier_ctx/null_check_4_ctx_const +verifier_ctx/null_check_8_null_bind +verifier_ctx/or_null_check_3_1 +verifier_ctx_sk_msg/of_size_in_sk_msg +verifier_ctx_sk_msg/past_end_of_sk_msg +verifier_ctx_sk_msg/read_offset_in_sk_msg +verifier_d_path/d_path_reject +verifier_direct_packet_access/access_test15_spill_with_xadd +verifier_direct_packet_access/direct_packet_access_test3 +verifier_direct_packet_access/id_in_regsafe_bad_access +verifier_direct_packet_access/packet_access_test10_write_invalid +verifier_direct_packet_access/pkt_end_reg_bad_access +verifier_direct_packet_access/pkt_end_reg_both_accesses +verifier_direct_packet_access/test16_arith_on_data_end +verifier_direct_packet_access/test23_x_pkt_ptr_4 +verifier_direct_packet_access/test26_marking_on_bad_access +verifier_direct_packet_access/test28_marking_on_bad_access +verifier_direct_stack_access_wraparound +verifier_global_ptr_args +verifier_global_subprogs +verifier_helper_access_var_len/bitwise_and_jmp_wrong_max +verifier_helper_access_var_len/jmp_signed_no_min_check +verifier_helper_access_var_len/map_adjusted_jmp_wrong_max +verifier_helper_access_var_len/memory_map_jmp_wrong_max +verifier_helper_access_var_len/memory_stack_jmp_bounds_offset +verifier_helper_access_var_len/memory_stack_jmp_wrong_max +verifier_helper_access_var_len/ptr_to_mem_or_null_2 +verifier_helper_access_var_len/ptr_to_mem_or_null_8 +verifier_helper_access_var_len/ptr_to_mem_or_null_9 +verifier_helper_access_var_len/stack_jmp_no_max_check +verifier_helper_packet_access/cls_helper_fail_range_1 +verifier_helper_packet_access/cls_helper_fail_range_2 +verifier_helper_packet_access/cls_helper_fail_range_3 +verifier_helper_packet_access/packet_ptr_with_bad_range_1 +verifier_helper_packet_access/packet_ptr_with_bad_range_2 +verifier_helper_packet_access/packet_test2_unchecked_packet_ptr +verifier_helper_packet_access/ptr_with_too_short_range_1 +verifier_helper_packet_access/ptr_with_too_short_range_2 +verifier_helper_packet_access/test11_cls_unsuitable_helper_1 +verifier_helper_packet_access/test12_cls_unsuitable_helper_2 +verifier_helper_packet_access/test15_cls_helper_fail_sub +verifier_helper_packet_access/test20_pkt_end_as_input +verifier_helper_packet_access/test7_cls_unchecked_packet_ptr +verifier_helper_packet_access/to_packet_test21_wrong_reg +verifier_helper_restricted +verifier_helper_value_access/access_to_map_empty_range +verifier_helper_value_access/access_to_map_negative_range +verifier_helper_value_access/access_to_map_possibly_empty_range +verifier_helper_value_access/access_to_map_wrong_size +verifier_helper_value_access/bounds_check_using_bad_access_1 +verifier_helper_value_access/bounds_check_using_bad_access_2 +verifier_helper_value_access/check_using_s_bad_access_1 +verifier_helper_value_access/check_using_s_bad_access_2 +verifier_helper_value_access/const_imm_negative_range_adjustment_1 +verifier_helper_value_access/const_imm_negative_range_adjustment_2 +verifier_helper_value_access/const_reg_negative_range_adjustment_1 +verifier_helper_value_access/const_reg_negative_range_adjustment_2 +verifier_helper_value_access/imm_out_of_bound_1 +verifier_helper_value_access/imm_out_of_bound_2 +verifier_helper_value_access/imm_out_of_bound_range +verifier_helper_value_access/map_out_of_bound_range +verifier_helper_value_access/map_via_variable_empty_range +verifier_helper_value_access/reg_out_of_bound_1 +verifier_helper_value_access/reg_out_of_bound_2 +verifier_helper_value_access/reg_out_of_bound_range +verifier_helper_value_access/via_const_imm_empty_range +verifier_helper_value_access/via_const_reg_empty_range +verifier_helper_value_access/via_variable_no_max_check_1 +verifier_helper_value_access/via_variable_no_max_check_2 +verifier_helper_value_access/via_variable_wrong_max_check_1 +verifier_helper_value_access/via_variable_wrong_max_check_2 +verifier_int_ptr/arg_ptr_to_long_misaligned +verifier_int_ptr/to_long_size_sizeof_long +verifier_iterating_callbacks/bpf_loop_iter_limit_overflow +verifier_iterating_callbacks/check_add_const_3regs +verifier_iterating_callbacks/check_add_const_3regs_2if +verifier_iterating_callbacks/check_add_const_regsafe_off +verifier_iterating_callbacks/iter_limit_bug +verifier_iterating_callbacks/jgt_imm64_and_may_goto +verifier_iterating_callbacks/loop_detection +verifier_iterating_callbacks/may_goto_self +verifier_iterating_callbacks/unsafe_find_vma +verifier_iterating_callbacks/unsafe_for_each_map_elem +verifier_iterating_callbacks/unsafe_on_2nd_iter +verifier_iterating_callbacks/unsafe_on_zero_iter +verifier_iterating_callbacks/unsafe_ringbuf_drain +verifier_jeq_infer_not_null/unchanged_for_jeq_false_branch +verifier_jeq_infer_not_null/unchanged_for_jne_true_branch +verifier_kfunc_prog_types/cgrp_kfunc_raw_tp +verifier_kfunc_prog_types/cpumask_kfunc_raw_tp +verifier_kfunc_prog_types/task_kfunc_raw_tp +verifier_ld_ind/ind_check_calling_conv_r1 +verifier_ld_ind/ind_check_calling_conv_r2 +verifier_ld_ind/ind_check_calling_conv_r3 +verifier_ld_ind/ind_check_calling_conv_r4 +verifier_ld_ind/ind_check_calling_conv_r5 +verifier_leak_ptr/leak_pointer_into_ctx_1 +verifier_leak_ptr/leak_pointer_into_ctx_2 +verifier_linked_scalars +verifier_loops1/bounded_recursion +verifier_loops1/infinite_loop_in_two_jumps +verifier_loops1/infinite_loop_three_jump_trick +verifier_loops1/loop_after_a_conditional_jump +verifier_lsm/bool_retval_test3 +verifier_lsm/bool_retval_test4 +verifier_lsm/disabled_hook_test1 +verifier_lsm/disabled_hook_test2 +verifier_lsm/disabled_hook_test3 +verifier_lsm/errno_zero_retval_test4 +verifier_lsm/errno_zero_retval_test5 +verifier_lsm/errno_zero_retval_test6 +verifier_lwt/not_permitted_for_lwt_prog +verifier_lwt/packet_write_for_lwt_in +verifier_lwt/packet_write_for_lwt_out +verifier_lwt/tc_classid_for_lwt_in +verifier_lwt/tc_classid_for_lwt_out +verifier_lwt/tc_classid_for_lwt_xmit +verifier_map_in_map/invalid_inner_map_pointer +verifier_map_in_map/on_the_inner_map_pointer +verifier_map_ptr/bpf_map_ptr_write_rejected +verifier_map_ptr/read_non_existent_field_rejected +verifier_map_ptr/read_with_negative_offset_rejected +verifier_map_ptr_mixing +verifier_map_ret_val +verifier_meta_access/meta_access_test10 +verifier_meta_access/meta_access_test2 +verifier_meta_access/meta_access_test3 +verifier_meta_access/meta_access_test4 +verifier_meta_access/meta_access_test5 +verifier_meta_access/meta_access_test6 +verifier_meta_access/meta_access_test9 +verifier_netfilter_ctx/with_invalid_ctx_access_test1 +verifier_netfilter_ctx/with_invalid_ctx_access_test2 +verifier_netfilter_ctx/with_invalid_ctx_access_test3 +verifier_netfilter_ctx/with_invalid_ctx_access_test4 +verifier_netfilter_ctx/with_invalid_ctx_access_test5 +verifier_netfilter_retcode/with_invalid_return_code_test1 +verifier_netfilter_retcode/with_invalid_return_code_test4 +verifier_or_jmp32_k +verifier_prevent_map_lookup +verifier_raw_stack/bytes_spilled_regs_corruption_2 +verifier_raw_stack/load_bytes_invalid_access_1 +verifier_raw_stack/load_bytes_invalid_access_2 +verifier_raw_stack/load_bytes_invalid_access_3 +verifier_raw_stack/load_bytes_invalid_access_4 +verifier_raw_stack/load_bytes_invalid_access_5 +verifier_raw_stack/load_bytes_invalid_access_6 +verifier_raw_stack/load_bytes_negative_len_2 +verifier_raw_stack/load_bytes_spilled_regs_corruption +verifier_raw_stack/skb_load_bytes_negative_len +verifier_raw_stack/skb_load_bytes_zero_len +verifier_raw_tp_writable +verifier_ref_tracking +verifier_reg_equal/subreg_equality_2 +verifier_regalloc/regalloc_and_spill_negative +verifier_regalloc/regalloc_negative +verifier_regalloc/regalloc_src_reg_negative +verifier_ringbuf/ringbuf_invalid_reservation_offset_1 +verifier_ringbuf/ringbuf_invalid_reservation_offset_2 +verifier_runtime_jit +verifier_scalar_ids/check_ids_in_regsafe +verifier_scalar_ids/check_ids_in_regsafe_2 +verifier_scalar_ids/linked_regs_broken_link_2 +verifier_search_pruning/for_u32_spills_u64_fill +verifier_search_pruning/liveness_pruning_and_write_screening +verifier_search_pruning/short_loop1 +verifier_search_pruning/should_be_verified_nop_operation +verifier_search_pruning/tracking_for_u32_spill_fill +verifier_search_pruning/varlen_map_value_access_pruning +verifier_sock/bpf_sk_fullsock_skb_sk +verifier_sock/bpf_sk_release_skb_sk +verifier_sock/bpf_tcp_sock_skb_sk +verifier_sock/dst_port_byte_load_invalid +verifier_sock/dst_port_half_load_invalid_1 +verifier_sock/dst_port_half_load_invalid_2 +verifier_sock/invalidate_pkt_pointers_by_tail_call +verifier_sock/invalidate_pkt_pointers_from_global_func +verifier_sock/map_lookup_elem_smap_key +verifier_sock/map_lookup_elem_sockhash_key +verifier_sock/map_lookup_elem_sockmap_key +verifier_sock/no_null_check_on_ret_1 +verifier_sock/no_null_check_on_ret_2 +verifier_sock/of_bpf_skc_to_helpers +verifier_sock/post_bind4_read_mark +verifier_sock/post_bind4_read_src_ip6 +verifier_sock/post_bind6_read_src_ip4 +verifier_sock/sk_1_1_value_1 +verifier_sock/sk_no_skb_sk_check_1 +verifier_sock/sk_no_skb_sk_check_2 +verifier_sock/sk_sk_type_fullsock_field_1 +verifier_sock/skb_sk_beyond_last_field_1 +verifier_sock/skb_sk_beyond_last_field_2 +verifier_sock/skb_sk_no_null_check +verifier_sock/sock_create_read_src_port +verifier_sock_addr/bind4_bad_return_code +verifier_sock_addr/bind6_bad_return_code +verifier_sock_addr/connect4_bad_return_code +verifier_sock_addr/connect6_bad_return_code +verifier_sock_addr/connect_unix_bad_return_code +verifier_sock_addr/getpeername4_bad_return_code +verifier_sock_addr/getpeername6_bad_return_code +verifier_sock_addr/getpeername_unix_bad_return_code +verifier_sock_addr/getsockname4_bad_return_code +verifier_sock_addr/getsockname6_bad_return_code +verifier_sock_addr/getsockname_unix_unix_bad_return_code +verifier_sock_addr/recvmsg4_bad_return_code +verifier_sock_addr/recvmsg6_bad_return_code +verifier_sock_addr/recvmsg_unix_bad_return_code +verifier_sock_addr/sendmsg4_bad_return_code +verifier_sock_addr/sendmsg6_bad_return_code +verifier_sock_addr/sendmsg_unix_bad_return_code +verifier_sockmap_mutate/test_flow_dissector_update +verifier_sockmap_mutate/test_raw_tp_delete +verifier_sockmap_mutate/test_raw_tp_update +verifier_sockmap_mutate/test_sockops_update +verifier_spill_fill/_6_offset_to_skb_data +verifier_spill_fill/addr_offset_to_skb_data +verifier_spill_fill/check_corrupted_spill_fill +verifier_spill_fill/fill_32bit_after_spill_64bit_clear_id +verifier_spill_fill/spill_16bit_of_32bit_fail +verifier_spill_fill/spill_32bit_of_64bit_fail +verifier_spill_fill/u64_offset_to_skb_data +verifier_spill_fill/with_invalid_reg_offset_0 +verifier_spin_lock/call_within_a_locked_region +verifier_spin_lock/lock_test2_direct_ld_st +verifier_spin_lock/lock_test3_direct_ld_st +verifier_spin_lock/lock_test4_direct_ld_st +verifier_spin_lock/lock_test7_unlock_without_lock +verifier_spin_lock/reg_id_for_map_value +verifier_spin_lock/spin_lock_test6_missing_unlock +verifier_spin_lock/spin_lock_test8_double_lock +verifier_spin_lock/spin_lock_test9_different_lock +verifier_spin_lock/test11_ld_abs_under_lock +verifier_stack_ptr/load_bad_alignment_on_off +verifier_stack_ptr/load_bad_alignment_on_reg +verifier_stack_ptr/load_out_of_bounds_high +verifier_stack_ptr/load_out_of_bounds_low +verifier_stack_ptr/to_stack_check_high_4 +verifier_stack_ptr/to_stack_check_high_5 +verifier_stack_ptr/to_stack_check_high_6 +verifier_stack_ptr/to_stack_check_high_7 +verifier_stack_ptr/to_stack_check_low_3 +verifier_stack_ptr/to_stack_check_low_4 +verifier_stack_ptr/to_stack_check_low_5 +verifier_stack_ptr/to_stack_check_low_6 +verifier_stack_ptr/to_stack_check_low_7 +verifier_subprog_precision/callback_precise_return_fail +verifier_tailcall_jit +verifier_uninit +verifier_unpriv +verifier_unpriv_perf +verifier_value/store_of_cleared_call_register +verifier_value_illegal_alu +verifier_value_or_null/map_access_from_else_condition +verifier_value_or_null/map_value_or_null_1 +verifier_value_or_null/map_value_or_null_2 +verifier_value_or_null/map_value_or_null_3 +verifier_value_or_null/multiple_map_lookup_elem_calls +verifier_value_or_null/null_check_ids_in_regsafe +verifier_value_ptr_arith/access_known_scalar_value_ptr_2 +verifier_value_ptr_arith/access_unknown_scalar_value_ptr +verifier_value_ptr_arith/access_value_ptr_known_scalar +verifier_value_ptr_arith/access_value_ptr_unknown_scalar +verifier_value_ptr_arith/access_value_ptr_value_ptr_1 +verifier_value_ptr_arith/access_value_ptr_value_ptr_2 +verifier_value_ptr_arith/lower_oob_arith_test_1 +verifier_value_ptr_arith/to_leak_tainted_dst_reg +verifier_value_ptr_arith/unknown_scalar_value_ptr_4 +verifier_value_ptr_arith/value_ptr_known_scalar_2_1 +verifier_value_ptr_arith/value_ptr_known_scalar_3 +verifier_var_off/access_max_out_of_bound +verifier_var_off/access_min_out_of_bound +verifier_var_off/stack_write_clobbers_spilled_regs +verifier_var_off/variable_offset_ctx_access +verifier_var_off/variable_offset_stack_access_unbounded +verifier_var_off/zero_sized_access_max_out_of_bound +verifier_vfs_reject +verifier_xadd/xadd_w_check_unaligned_map +verifier_xadd/xadd_w_check_unaligned_pkt +verifier_xadd/xadd_w_check_unaligned_stack +verifier_xdp_direct_packet_access/corner_case_1_bad_access_1 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_10 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_11 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_12 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_13 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_14 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_15 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_16 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_2 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_3 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_4 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_5 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_6 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_7 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_8 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_9 +verifier_xdp_direct_packet_access/end_mangling_bad_access_1 +verifier_xdp_direct_packet_access/end_mangling_bad_access_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_5 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_6 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_7 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_8 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_4 +verify_pkcs7_sig +xdp_synproxy diff --git a/ci/vmtest/configs/DENYLIST.x86_64 b/ci/vmtest/configs/DENYLIST.x86_64 new file mode 100644 index 0000000000000..6fc3413daab9f --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.x86_64 @@ -0,0 +1 @@ +netcnt # with kvm enabled, fail with packets unexpected packets: actual 10001 != expected 10000 diff --git a/ci/vmtest/configs/run-vmtest.env b/ci/vmtest/configs/run-vmtest.env new file mode 100644 index 0000000000000..c60f1db6673c7 --- /dev/null +++ b/ci/vmtest/configs/run-vmtest.env @@ -0,0 +1,42 @@ +#!/bin/bash + +# This file is sourced by libbpf/ci/run-vmtest Github Action scripts. +# +# The primary reason it exists is that assembling ALLOWLIST and +# DENYLIST for a particular test run is not a trivial operation. +# +# Users of libbpf/ci/run-vmtest action need to be able to specify a +# list of allow/denylist **files**, that later has to be correctly +# merged into a single allow/denylist passed to a test runner. +# +# Obviously it's perferrable for the scripts merging many lists into +# one to be reusable, and not copy-pasted between repositories which +# use libbpf/ci actions. And specifying the lists should be trivial. +# This file is a solution to that. + +# $SELFTESTS_BPF and $VMTEST_CONFIGS are set in the workflow, before +# libbpf/ci/run-vmtest action is called +# See .github/workflows/kernel-test.yml + +ALLOWLIST_FILES=( + "${SELFTESTS_BPF}/ALLOWLIST" + "${SELFTESTS_BPF}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST" + "${VMTEST_CONFIGS}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/ALLOWLIST.${KERNEL_TEST}" +) + +DENYLIST_FILES=( + "${SELFTESTS_BPF}/DENYLIST" + "${SELFTESTS_BPF}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST" + "${VMTEST_CONFIGS}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/DENYLIST.${KERNEL_TEST}" +) + +# Export pipe-separated strings, because bash doesn't support array export +export SELFTESTS_BPF_ALLOWLIST_FILES=$(IFS="|"; echo "${ALLOWLIST_FILES[*]}") +export SELFTESTS_BPF_DENYLIST_FILES=$(IFS="|"; echo "${DENYLIST_FILES[*]}") + diff --git a/ci/vmtest/configs/run_veristat.kernel.cfg b/ci/vmtest/configs/run_veristat.kernel.cfg new file mode 100644 index 0000000000000..807efc251073f --- /dev/null +++ b/ci/vmtest/configs/run_veristat.kernel.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${SELFTESTS_BPF}" +VERISTAT_OBJECTS_GLOB="*.bpf.o" +VERISTAT_CFG_FILE="${SELFTESTS_BPF}/veristat.cfg" +VERISTAT_OUTPUT="veristat-kernel" diff --git a/ci/vmtest/configs/run_veristat.meta.cfg b/ci/vmtest/configs/run_veristat.meta.cfg new file mode 100644 index 0000000000000..14f08d241d206 --- /dev/null +++ b/ci/vmtest/configs/run_veristat.meta.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${WORKING_DIR}/bpf_objects" +VERISTAT_OBJECTS_GLOB="*.o" +VERISTAT_OUTPUT="veristat-meta" +VERISTAT_CFG_FILE="${VERISTAT_CONFIGS}/veristat_meta.cfg" diff --git a/ci/vmtest/configs/veristat_meta.cfg b/ci/vmtest/configs/veristat_meta.cfg new file mode 100644 index 0000000000000..a8c25d71cb9e2 --- /dev/null +++ b/ci/vmtest/configs/veristat_meta.cfg @@ -0,0 +1,10 @@ +# List of exceptions we know about that are not going to work with veristat. + +# needs 'migrate_misplaced_page' which went away in +# commit 73eab3ca481e ("mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio()") +!numamove_bpf-numamove_bpf.o + +# use non-libbpf loader +!takeover_bpf_lib-takeover.bpf.o +!tcp_tuner_bpf_lib-tcptuner.bpf.o + diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c index 814b16bb1953f..e5301fac13971 100644 --- a/drivers/accel/amdxdna/amdxdna_mailbox.c +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index fc92e43d0fe93..ef9444482db19 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -21,9 +21,15 @@ struct platform_profile_handler { struct device dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + unsigned long hidden_choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; const struct platform_profile_ops *ops; }; +struct aggregate_choices_data { + unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + int count; +}; + static const char * const profile_names[] = { [PLATFORM_PROFILE_LOW_POWER] = "low-power", [PLATFORM_PROFILE_COOL] = "cool", @@ -73,7 +79,7 @@ static int _store_class_profile(struct device *dev, void *data) lockdep_assert_held(&profile_lock); handler = to_pprof_handler(dev); - if (!test_bit(*bit, handler->choices)) + if (!test_bit(*bit, handler->choices) && !test_bit(*bit, handler->hidden_choices)) return -EOPNOTSUPP; return handler->ops->profile_set(dev, *bit); @@ -239,21 +245,44 @@ static const struct class platform_profile_class = { /** * _aggregate_choices - Aggregate the available profile choices * @dev: The device - * @data: The available profile choices + * @arg: struct aggregate_choices_data * * Return: 0 on success, -errno on failure */ -static int _aggregate_choices(struct device *dev, void *data) +static int _aggregate_choices(struct device *dev, void *arg) { + unsigned long tmp[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + struct aggregate_choices_data *data = arg; struct platform_profile_handler *handler; - unsigned long *aggregate = data; lockdep_assert_held(&profile_lock); handler = to_pprof_handler(dev); - if (test_bit(PLATFORM_PROFILE_LAST, aggregate)) - bitmap_copy(aggregate, handler->choices, PLATFORM_PROFILE_LAST); + bitmap_or(tmp, handler->choices, handler->hidden_choices, PLATFORM_PROFILE_LAST); + if (test_bit(PLATFORM_PROFILE_LAST, data->aggregate)) + bitmap_copy(data->aggregate, tmp, PLATFORM_PROFILE_LAST); else - bitmap_and(aggregate, handler->choices, aggregate, PLATFORM_PROFILE_LAST); + bitmap_and(data->aggregate, tmp, data->aggregate, PLATFORM_PROFILE_LAST); + data->count++; + + return 0; +} + +/** + * _remove_hidden_choices - Remove hidden choices from aggregate data + * @dev: The device + * @arg: struct aggregate_choices_data + * + * Return: 0 on success, -errno on failure + */ +static int _remove_hidden_choices(struct device *dev, void *arg) +{ + struct aggregate_choices_data *data = arg; + struct platform_profile_handler *handler; + + lockdep_assert_held(&profile_lock); + handler = to_pprof_handler(dev); + bitmap_andnot(data->aggregate, handler->choices, + handler->hidden_choices, PLATFORM_PROFILE_LAST); return 0; } @@ -270,22 +299,31 @@ static ssize_t platform_profile_choices_show(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + struct aggregate_choices_data data = { + .aggregate = { [0 ... BITS_TO_LONGS(PLATFORM_PROFILE_LAST) - 1] = ~0UL }, + .count = 0, + }; int err; - set_bit(PLATFORM_PROFILE_LAST, aggregate); + set_bit(PLATFORM_PROFILE_LAST, data.aggregate); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &profile_lock) { err = class_for_each_device(&platform_profile_class, NULL, - aggregate, _aggregate_choices); + &data, _aggregate_choices); if (err) return err; + if (data.count == 1) { + err = class_for_each_device(&platform_profile_class, NULL, + &data, _remove_hidden_choices); + if (err) + return err; + } } /* no profile handler registered any more */ - if (bitmap_empty(aggregate, PLATFORM_PROFILE_LAST)) + if (bitmap_empty(data.aggregate, PLATFORM_PROFILE_LAST)) return -EINVAL; - return _commmon_choices_show(aggregate, buf); + return _commmon_choices_show(data.aggregate, buf); } /** @@ -373,7 +411,10 @@ static ssize_t platform_profile_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + struct aggregate_choices_data data = { + .aggregate = { [0 ... BITS_TO_LONGS(PLATFORM_PROFILE_LAST) - 1] = ~0UL }, + .count = 0, + }; int ret; int i; @@ -381,13 +422,13 @@ static ssize_t platform_profile_store(struct device *dev, i = sysfs_match_string(profile_names, buf); if (i < 0 || i == PLATFORM_PROFILE_CUSTOM) return -EINVAL; - set_bit(PLATFORM_PROFILE_LAST, choices); + set_bit(PLATFORM_PROFILE_LAST, data.aggregate); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &profile_lock) { ret = class_for_each_device(&platform_profile_class, NULL, - choices, _aggregate_choices); + &data, _aggregate_choices); if (ret) return ret; - if (!test_bit(i, choices)) + if (!test_bit(i, data.aggregate)) return -EOPNOTSUPP; ret = class_for_each_device(&platform_profile_class, NULL, &i, @@ -417,8 +458,14 @@ static int profile_class_registered(struct device *dev, const void *data) static umode_t profile_class_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { - if (!class_find_device(&platform_profile_class, NULL, NULL, profile_class_registered)) + struct device *dev; + + dev = class_find_device(&platform_profile_class, NULL, NULL, profile_class_registered); + if (!dev) return 0; + + put_device(dev); + return attr->mode; } @@ -447,12 +494,15 @@ EXPORT_SYMBOL_GPL(platform_profile_notify); */ int platform_profile_cycle(void) { + struct aggregate_choices_data data = { + .aggregate = { [0 ... BITS_TO_LONGS(PLATFORM_PROFILE_LAST) - 1] = ~0UL }, + .count = 0, + }; enum platform_profile_option next = PLATFORM_PROFILE_LAST; enum platform_profile_option profile = PLATFORM_PROFILE_LAST; - unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int err; - set_bit(PLATFORM_PROFILE_LAST, choices); + set_bit(PLATFORM_PROFILE_LAST, data.aggregate); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &profile_lock) { err = class_for_each_device(&platform_profile_class, NULL, &profile, _aggregate_profiles); @@ -464,14 +514,14 @@ int platform_profile_cycle(void) return -EINVAL; err = class_for_each_device(&platform_profile_class, NULL, - choices, _aggregate_choices); + &data, _aggregate_choices); if (err) return err; /* never iterate into a custom if all drivers supported it */ - clear_bit(PLATFORM_PROFILE_CUSTOM, choices); + clear_bit(PLATFORM_PROFILE_CUSTOM, data.aggregate); - next = find_next_bit_wrap(choices, + next = find_next_bit_wrap(data.aggregate, PLATFORM_PROFILE_LAST, profile + 1); @@ -526,6 +576,14 @@ struct device *platform_profile_register(struct device *dev, const char *name, return ERR_PTR(-EINVAL); } + if (ops->hidden_choices) { + err = ops->hidden_choices(drvdata, pprof->hidden_choices); + if (err) { + dev_err(dev, "platform_profile hidden_choices failed\n"); + return ERR_PTR(err); + } + } + guard(mutex)(&profile_lock); /* create class interface for individual handler */ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 698897b29de24..586cc7d1d8aa3 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -590,6 +590,8 @@ static void acpi_idle_play_dead(struct cpuidle_device *dev, int index) raw_safe_halt(); else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { io_idle(cx->address); + } else if (cx->entry_method == ACPI_CSTATE_FFH) { + acpi_processor_ffh_play_dead(cx); } else return; } diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c index bc6bae76ccaf1..94c6446604fc9 100644 --- a/drivers/android/binderfs.c +++ b/drivers/android/binderfs.c @@ -274,6 +274,7 @@ static void binderfs_evict_inode(struct inode *inode) mutex_unlock(&binderfs_minors_mutex); if (refcount_dec_and_test(&device->ref)) { + hlist_del_init(&device->hlist); kfree(device->context.name); kfree(device); } diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 8e895ae45c86b..c842e2de6ef98 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -386,8 +386,12 @@ struct ahci_host_priv { static inline bool ahci_ignore_port(struct ahci_host_priv *hpriv, unsigned int portid) { - return portid >= hpriv->nports || - !(hpriv->mask_port_map & (1 << portid)); + if (portid >= hpriv->nports) + return true; + /* mask_port_map not set means that all ports are available */ + if (!hpriv->mask_port_map) + return false; + return !(hpriv->mask_port_map & (1 << portid)); } extern int ahci_ignore_sss; diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index fdfa7b2662180..e7ace4b10f15b 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -541,6 +541,7 @@ void ahci_save_initial_config(struct device *dev, struct ahci_host_priv *hpriv) hpriv->saved_port_map = port_map; } + /* mask_port_map not set means that all ports are available */ if (hpriv->mask_port_map) { dev_warn(dev, "masking port_map 0x%lx -> 0x%lx\n", port_map, diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 53b2c7719dc51..91d44302eac92 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -651,8 +651,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, * If no sub-node was found, keep this for device tree * compatibility */ - hpriv->mask_port_map |= BIT(0); - rc = ahci_platform_get_phy(hpriv, 0, dev, dev->of_node); if (rc) goto err_out; diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 63ec2f2184319..c085dd81ebe7f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4143,10 +4143,6 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, - { "Samsung SSD 870 QVO*", NULL, ATA_QUIRK_NO_NCQ_TRIM | - ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI | - ATA_QUIRK_NOLPM }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c index dce24806a0529..2d32125c16fd4 100644 --- a/drivers/ata/pata_octeon_cf.c +++ b/drivers/ata/pata_octeon_cf.c @@ -935,9 +935,8 @@ static int octeon_cf_probe(struct platform_device *pdev) ap->mwdma_mask = enable_dma ? ATA_MWDMA4 : 0; /* True IDE mode needs a timer to poll for not-busy. */ - hrtimer_init(&cf_port->delayed_finish, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cf_port->delayed_finish.function = octeon_cf_delayed_finish; + hrtimer_setup(&cf_port->delayed_finish, octeon_cf_delayed_finish, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } else { /* 16 bit but not True IDE */ base = cs0 + 0x800; diff --git a/drivers/base/core.c b/drivers/base/core.c index 5a1f051981149..2fde698430dff 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2079,6 +2079,7 @@ static bool __fw_devlink_relax_cycles(struct fwnode_handle *con_handle, out: sup_handle->flags &= ~FWNODE_FLAG_VISITED; put_device(sup_dev); + put_device(con_dev); put_device(par_dev); return ret; } diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 2ee45841486bc..425c43b2d478a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1764,8 +1764,8 @@ void pm_runtime_init(struct device *dev) INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; - hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - dev->power.suspend_timer.function = pm_suspend_timer_fn; + hrtimer_setup(&dev->power.suspend_timer, pm_suspend_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); init_waitqueue_head(&dev->power.wait_queue); } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d94ef37480bd4..16457330c1bc7 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1426,8 +1426,7 @@ static void nullb_setup_bwtimer(struct nullb *nullb) { ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - nullb->bw_timer.function = nullb_bwtimer_fn; + hrtimer_setup(&nullb->bw_timer, nullb_bwtimer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } @@ -1604,8 +1603,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; + hrtimer_setup(&cmd->timer, null_cmd_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } cmd->error = BLK_STS_OK; cmd->nq = nq; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 529085181f355..ca9a67b5b537a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2715,9 +2715,12 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + if (test_bit(UB_STATE_USED, &ub->state)) { + /* + * Parameters can only be changed when device hasn't + * been started yet + */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { ret = -EFAULT; diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 90966dfbd2781..2a8d91963c63f 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2102,7 +2102,8 @@ static int btusb_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return submit_or_queue_tx_urb(hdev, urb); case HCI_SCODATA_PKT: - if (hci_conn_num(hdev, SCO_LINK) < 1) + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + hci_conn_num(hdev, SCO_LINK) < 1) return -ENODEV; urb = alloc_isoc_urb(hdev, skb); @@ -2576,7 +2577,8 @@ static int btusb_send_frame_intel(struct hci_dev *hdev, struct sk_buff *skb) return submit_or_queue_tx_urb(hdev, urb); case HCI_SCODATA_PKT: - if (hci_conn_num(hdev, SCO_LINK) < 1) + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + hci_conn_num(hdev, SCO_LINK) < 1) return -ENODEV; urb = alloc_isoc_urb(hdev, skb); @@ -3642,6 +3644,7 @@ static ssize_t force_poll_sync_write(struct file *file, } static const struct file_operations force_poll_sync_fops = { + .owner = THIS_MODULE, .open = simple_open, .read = force_poll_sync_read, .write = force_poll_sync_write, diff --git a/drivers/bus/mhi/host/pci_generic.c b/drivers/bus/mhi/host/pci_generic.c index c41119b9079f0..7ffea0f981628 100644 --- a/drivers/bus/mhi/host/pci_generic.c +++ b/drivers/bus/mhi/host/pci_generic.c @@ -1095,8 +1095,9 @@ static void mhi_pci_recovery_work(struct work_struct *work) err_unprepare: mhi_unprepare_after_power_down(mhi_cntrl); err_try_reset: - if (pci_reset_function(pdev)) - dev_err(&pdev->dev, "Recovery failed\n"); + err = pci_try_reset_function(pdev); + if (err) + dev_err(&pdev->dev, "Recovery failed: %d\n", err); } static void health_check(struct timer_list *t) diff --git a/drivers/bus/simple-pm-bus.c b/drivers/bus/simple-pm-bus.c index 5dea31769f9a8..d8e029e7e53f7 100644 --- a/drivers/bus/simple-pm-bus.c +++ b/drivers/bus/simple-pm-bus.c @@ -109,9 +109,29 @@ static int simple_pm_bus_runtime_resume(struct device *dev) return 0; } +static int simple_pm_bus_suspend(struct device *dev) +{ + struct simple_pm_bus *bus = dev_get_drvdata(dev); + + if (!bus) + return 0; + + return pm_runtime_force_suspend(dev); +} + +static int simple_pm_bus_resume(struct device *dev) +{ + struct simple_pm_bus *bus = dev_get_drvdata(dev); + + if (!bus) + return 0; + + return pm_runtime_force_resume(dev); +} + static const struct dev_pm_ops simple_pm_bus_pm_ops = { RUNTIME_PM_OPS(simple_pm_bus_runtime_suspend, simple_pm_bus_runtime_resume, NULL) - NOIRQ_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + NOIRQ_SYSTEM_SLEEP_PM_OPS(simple_pm_bus_suspend, simple_pm_bus_resume) }; #define ONLY_BUS ((void *) 1) /* Match if the device is only a bus. */ diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index c573ed2ee71a8..7811aa7340537 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -473,8 +473,12 @@ static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cdx_device *cdx_dev = to_cdx_device(dev); + ssize_t len; - return sysfs_emit(buf, "%s\n", cdx_dev->driver_override); + device_lock(dev); + len = sysfs_emit(buf, "%s\n", cdx_dev->driver_override); + device_unlock(dev); + return len; } static DEVICE_ATTR_RW(driver_override); diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 7174bfccc7b32..b95f6d0f17ede 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -152,8 +152,7 @@ static int timeriomem_rng_probe(struct platform_device *pdev) priv->period = ns_to_ktime(period * NSEC_PER_USEC); init_completion(&priv->completion); - hrtimer_init(&priv->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - priv->timer.function = timeriomem_rng_trigger; + hrtimer_setup(&priv->timer, timeriomem_rng_trigger, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); priv->rng_ops.name = dev_name(&pdev->dev); priv->rng_ops.read = timeriomem_rng_read; diff --git a/drivers/char/misc.c b/drivers/char/misc.c index 2cf595d2e10b8..f7dd455dd0dd3 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -264,8 +264,8 @@ int misc_register(struct miscdevice *misc) device_create_with_groups(&misc_class, misc->parent, dev, misc, misc->groups, "%s", misc->name); if (IS_ERR(misc->this_device)) { + misc_minor_free(misc->minor); if (is_dynamic) { - misc_minor_free(misc->minor); misc->minor = MISC_DYNAMIC_MINOR; } err = PTR_ERR(misc->this_device); diff --git a/drivers/char/random.c b/drivers/char/random.c index 2581186fa61b7..92cbd24a36d8c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -278,7 +278,7 @@ static void crng_reseed(struct work_struct *work) WRITE_ONCE(base_crng.generation, next_gen); #ifdef CONFIG_VDSO_GETRANDOM /* base_crng.generation's invalid value is ULONG_MAX, while - * _vdso_rng_data.generation's invalid value is 0, so add one to the + * vdso_k_rng_data->generation's invalid value is 0, so add one to the * former to arrive at the latter. Use smp_store_release so that this * is ordered with the write above to base_crng.generation. Pairs with * the smp_rmb() before the syscall in the vDSO code. @@ -290,7 +290,7 @@ static void crng_reseed(struct work_struct *work) * because the vDSO side only checks whether the value changed, without * actually using or interpreting the value. */ - smp_store_release((unsigned long *)&__arch_get_k_vdso_rng_data()->generation, next_gen + 1); + smp_store_release((unsigned long *)&vdso_k_rng_data->generation, next_gen + 1); #endif if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; @@ -743,7 +743,7 @@ static void __cold _credit_init_bits(size_t bits) queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); #ifdef CONFIG_VDSO_GETRANDOM - WRITE_ONCE(__arch_get_k_vdso_rng_data()->is_ready, true); + WRITE_ONCE(vdso_k_rng_data->is_ready, true); #endif wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 24442485e73e7..18f92dd44d456 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -923,14 +923,14 @@ static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); ret = 0; - if (pipe_empty(pipe->head, pipe->tail)) + if (pipe_is_empty(pipe)) goto error_out; ret = wait_port_writable(port, filp->f_flags & O_NONBLOCK); if (ret < 0) goto error_out; - occupancy = pipe_occupancy(pipe->head, pipe->tail); + occupancy = pipe_buf_usage(pipe); buf = alloc_buf(port->portdev->vdev, 0, occupancy); if (!buf) { diff --git a/drivers/clocksource/jcore-pit.c b/drivers/clocksource/jcore-pit.c index a3fe98cd38382..82815428f8f92 100644 --- a/drivers/clocksource/jcore-pit.c +++ b/drivers/clocksource/jcore-pit.c @@ -114,6 +114,18 @@ static int jcore_pit_local_init(unsigned cpu) pit->periodic_delta = DIV_ROUND_CLOSEST(NSEC_PER_SEC, HZ * buspd); clockevents_config_and_register(&pit->ced, freq, 1, ULONG_MAX); + enable_percpu_irq(pit->ced.irq, IRQ_TYPE_NONE); + + return 0; +} + +static int jcore_pit_local_teardown(unsigned cpu) +{ + struct jcore_pit *pit = this_cpu_ptr(jcore_pit_percpu); + + pr_info("Local J-Core PIT teardown on cpu %u\n", cpu); + + disable_percpu_irq(pit->ced.irq); return 0; } @@ -168,6 +180,7 @@ static int __init jcore_pit_init(struct device_node *node) return -ENOMEM; } + irq_set_percpu_devid(pit_irq); err = request_percpu_irq(pit_irq, jcore_timer_interrupt, "jcore_pit", jcore_pit_percpu); if (err) { @@ -237,7 +250,7 @@ static int __init jcore_pit_init(struct device_node *node) cpuhp_setup_state(CPUHP_AP_JCORE_TIMER_STARTING, "clockevents/jcore:starting", - jcore_pit_local_init, NULL); + jcore_pit_local_init, jcore_pit_local_teardown); return 0; } diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9c4cc01fd51aa..f06b9bc999456 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2200,28 +2200,20 @@ static int knl_get_turbo_pstate(int cpu) return ret; } -static void hybrid_get_type(void *data) -{ - u8 *cpu_type = data; - - *cpu_type = get_this_hybrid_cpu_type(); -} - static int hwp_get_cpu_scaling(int cpu) { if (hybrid_scaling_factor) { - u8 cpu_type = 0; - - smp_call_function_single(cpu, hybrid_get_type, &cpu_type, 1); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u8 cpu_type = c->topo.intel_type; /* * Return the hybrid scaling factor for P-cores and use the * default core scaling for E-cores. */ - if (cpu_type == 0x40) + if (cpu_type == INTEL_CPU_TYPE_CORE) return hybrid_scaling_factor; - if (cpu_type == 0x20) + if (cpu_type == INTEL_CPU_TYPE_ATOM) return core_get_scaling(); } diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c index e2a1e4463b6f5..0470d7c175f4f 100644 --- a/drivers/devfreq/event/rockchip-dfi.c +++ b/drivers/devfreq/event/rockchip-dfi.c @@ -642,8 +642,7 @@ static int rockchip_ddr_perf_init(struct rockchip_dfi *dfi) if (ret) return ret; - hrtimer_init(&dfi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - dfi->timer.function = rockchip_dfi_timer; + hrtimer_setup(&dfi->timer, rockchip_dfi_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); switch (dfi->ddr_type) { case ROCKCHIP_DDRTYPE_LPDDR2: diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index c14557efd5770..bbc3276992bb0 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -59,9 +59,6 @@ struct bam_desc_hw { #define DESC_FLAG_NWD BIT(12) #define DESC_FLAG_CMD BIT(11) -#define BAM_NDP_REVISION_START 0x20 -#define BAM_NDP_REVISION_END 0x27 - struct bam_async_desc { struct virt_dma_desc vd; @@ -401,7 +398,6 @@ struct bam_device { /* dma start transaction tasklet */ struct tasklet_struct task; - u32 bam_revision; }; /** @@ -445,10 +441,8 @@ static void bam_reset(struct bam_device *bdev) writel_relaxed(val, bam_addr(bdev, 0, BAM_CTRL)); /* set descriptor threshold, start with 4 bytes */ - if (in_range(bdev->bam_revision, BAM_NDP_REVISION_START, - BAM_NDP_REVISION_END)) - writel_relaxed(DEFAULT_CNT_THRSHLD, - bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); + writel_relaxed(DEFAULT_CNT_THRSHLD, + bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); /* Enable default set of h/w workarounds, ie all except BAM_FULL_PIPE */ writel_relaxed(BAM_CNFG_BITS_DEFAULT, bam_addr(bdev, 0, BAM_CNFG_BITS)); @@ -1006,10 +1000,9 @@ static void bam_apply_new_config(struct bam_chan *bchan, maxburst = bchan->slave.src_maxburst; else maxburst = bchan->slave.dst_maxburst; - if (in_range(bdev->bam_revision, BAM_NDP_REVISION_START, - BAM_NDP_REVISION_END)) - writel_relaxed(maxburst, - bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); + + writel_relaxed(maxburst, + bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); } bchan->reconfigure = 0; @@ -1199,11 +1192,10 @@ static int bam_init(struct bam_device *bdev) u32 val; /* read revision and configuration information */ - val = readl_relaxed(bam_addr(bdev, 0, BAM_REVISION)); - if (!bdev->num_ees) + if (!bdev->num_ees) { + val = readl_relaxed(bam_addr(bdev, 0, BAM_REVISION)); bdev->num_ees = (val >> NUM_EES_SHIFT) & NUM_EES_MASK; - - bdev->bam_revision = val & REVISION_MASK; + } /* check that configured EE is within range */ if (bdev->ee >= bdev->num_ees) diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index 5c6a5b3589873..ce80ac4b1a1b0 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -83,7 +83,9 @@ struct tegra_adma; * @nr_channels: Number of DMA channels available. * @ch_fifo_size_mask: Mask for FIFO size field. * @sreq_index_offset: Slave channel index offset. + * @max_page: Maximum ADMA Channel Page. * @has_outstanding_reqs: If DMA channel can have outstanding requests. + * @set_global_pg_config: Global page programming. */ struct tegra_adma_chip_data { unsigned int (*adma_get_burst_config)(unsigned int burst_size); @@ -99,6 +101,7 @@ struct tegra_adma_chip_data { unsigned int nr_channels; unsigned int ch_fifo_size_mask; unsigned int sreq_index_offset; + unsigned int max_page; bool has_outstanding_reqs; void (*set_global_pg_config)(struct tegra_adma *tdma); }; @@ -854,6 +857,7 @@ static const struct tegra_adma_chip_data tegra210_chip_data = { .nr_channels = 22, .ch_fifo_size_mask = 0xf, .sreq_index_offset = 2, + .max_page = 0, .has_outstanding_reqs = false, .set_global_pg_config = NULL, }; @@ -871,6 +875,7 @@ static const struct tegra_adma_chip_data tegra186_chip_data = { .nr_channels = 32, .ch_fifo_size_mask = 0x1f, .sreq_index_offset = 4, + .max_page = 4, .has_outstanding_reqs = true, .set_global_pg_config = tegra186_adma_global_page_config, }; diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c index 04c42c83a2bad..f3da9385ca0d8 100644 --- a/drivers/edac/qcom_edac.c +++ b/drivers/edac/qcom_edac.c @@ -95,7 +95,7 @@ static int qcom_llcc_core_setup(struct llcc_drv_data *drv, struct regmap *llcc_b * Configure interrupt enable registers such that Tag, Data RAM related * interrupts are propagated to interrupt controller for servicing */ - ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_2_enable, + ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_0_enable, TRP0_INTERRUPT_ENABLE, TRP0_INTERRUPT_ENABLE); if (ret) @@ -113,7 +113,7 @@ static int qcom_llcc_core_setup(struct llcc_drv_data *drv, struct regmap *llcc_b if (ret) return ret; - ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_2_enable, + ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_0_enable, DRP0_INTERRUPT_ENABLE, DRP0_INTERRUPT_ENABLE); if (ret) diff --git a/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c b/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c index 83b69fc4fba5b..a8915d3b4df51 100644 --- a/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c +++ b/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c @@ -254,8 +254,8 @@ static int scmi_imx_misc_ctrl_set(const struct scmi_protocol_handle *ph, if (num > max_num) return -EINVAL; - ret = ph->xops->xfer_get_init(ph, SCMI_IMX_MISC_CTRL_SET, sizeof(*in), - 0, &t); + ret = ph->xops->xfer_get_init(ph, SCMI_IMX_MISC_CTRL_SET, + sizeof(*in) + num * sizeof(__le32), 0, &t); if (ret) return ret; diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c index 5365e9a430007..42433c19eb308 100644 --- a/drivers/firmware/cirrus/cs_dsp.c +++ b/drivers/firmware/cirrus/cs_dsp.c @@ -1609,8 +1609,8 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, goto out_fw; } - ret = regmap_raw_write_async(regmap, reg, buf->buf, - le32_to_cpu(region->len)); + ret = regmap_raw_write(regmap, reg, buf->buf, + le32_to_cpu(region->len)); if (ret != 0) { cs_dsp_err(dsp, "%s.%d: Failed to write %d bytes at %d in %s: %d\n", @@ -1625,12 +1625,6 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, regions++; } - ret = regmap_async_complete(regmap); - if (ret != 0) { - cs_dsp_err(dsp, "Failed to complete async write: %d\n", ret); - goto out_fw; - } - if (pos > firmware->size) cs_dsp_warn(dsp, "%s.%d: %zu bytes at end of file\n", file, regions, pos - firmware->size); @@ -1638,7 +1632,6 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, cs_dsp_debugfs_save_wmfwname(dsp, file); out_fw: - regmap_async_complete(regmap); cs_dsp_buf_free(&buf_list); if (ret == -EOVERFLOW) @@ -2326,8 +2319,8 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware cs_dsp_dbg(dsp, "%s.%d: Writing %d bytes at %x\n", file, blocks, le32_to_cpu(blk->len), reg); - ret = regmap_raw_write_async(regmap, reg, buf->buf, - le32_to_cpu(blk->len)); + ret = regmap_raw_write(regmap, reg, buf->buf, + le32_to_cpu(blk->len)); if (ret != 0) { cs_dsp_err(dsp, "%s.%d: Failed to write to %x in %s: %d\n", @@ -2339,10 +2332,6 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware blocks++; } - ret = regmap_async_complete(regmap); - if (ret != 0) - cs_dsp_err(dsp, "Failed to complete async write: %d\n", ret); - if (pos > firmware->size) cs_dsp_warn(dsp, "%s.%d: %zu bytes at end of file\n", file, blocks, pos - firmware->size); @@ -2350,7 +2339,6 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware cs_dsp_debugfs_save_binname(dsp, file); out_fw: - regmap_async_complete(regmap); cs_dsp_buf_free(&buf_list); if (ret == -EOVERFLOW) @@ -2561,8 +2549,8 @@ static int cs_dsp_adsp2_enable_core(struct cs_dsp *dsp) { int ret; - ret = regmap_update_bits_async(dsp->regmap, dsp->base + ADSP2_CONTROL, - ADSP2_SYS_ENA, ADSP2_SYS_ENA); + ret = regmap_update_bits(dsp->regmap, dsp->base + ADSP2_CONTROL, + ADSP2_SYS_ENA, ADSP2_SYS_ENA); if (ret != 0) return ret; diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c index fa9c1c3bf168b..f0a63d09d3c49 100644 --- a/drivers/firmware/efi/cper-arm.c +++ b/drivers/firmware/efi/cper-arm.c @@ -311,7 +311,7 @@ void cper_print_proc_arm(const char *pfx, ctx_info = (struct cper_arm_ctx_info *)err_info; max_ctx_type = ARRAY_SIZE(arm_reg_ctx_strs) - 1; for (i = 0; i < proc->context_info_num; i++) { - int size = sizeof(*ctx_info) + ctx_info->size; + int size = ALIGN(sizeof(*ctx_info) + ctx_info->size, 16); printk("%sContext info structure %d:\n", pfx, i); if (len < size) { diff --git a/drivers/firmware/efi/cper-x86.c b/drivers/firmware/efi/cper-x86.c index 438ed9eff6d01..3949d7b5e808f 100644 --- a/drivers/firmware/efi/cper-x86.c +++ b/drivers/firmware/efi/cper-x86.c @@ -325,7 +325,7 @@ void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc) ctx_info = (struct cper_ia_proc_ctx *)err_info; for (i = 0; i < VALID_PROC_CXT_INFO_NUM(proc->validation_bits); i++) { - int size = sizeof(*ctx_info) + ctx_info->reg_arr_size; + int size = ALIGN(sizeof(*ctx_info) + ctx_info->reg_arr_size, 16); int groupsize = 4; printk("%sContext Information Structure %d:\n", pfx, i); diff --git a/drivers/firmware/efi/mokvar-table.c b/drivers/firmware/efi/mokvar-table.c index 5ed0602c2f75f..208db29613c63 100644 --- a/drivers/firmware/efi/mokvar-table.c +++ b/drivers/firmware/efi/mokvar-table.c @@ -99,14 +99,13 @@ static struct kobject *mokvar_kobj; */ void __init efi_mokvar_table_init(void) { + struct efi_mokvar_table_entry __aligned(1) *mokvar_entry, *next_entry; efi_memory_desc_t md; void *va = NULL; unsigned long cur_offset = 0; unsigned long offset_limit; - unsigned long map_size = 0; unsigned long map_size_needed = 0; unsigned long size; - struct efi_mokvar_table_entry *mokvar_entry; int err; if (!efi_enabled(EFI_MEMMAP)) @@ -134,48 +133,46 @@ void __init efi_mokvar_table_init(void) */ err = -EINVAL; while (cur_offset + sizeof(*mokvar_entry) <= offset_limit) { - mokvar_entry = va + cur_offset; - map_size_needed = cur_offset + sizeof(*mokvar_entry); - if (map_size_needed > map_size) { - if (va) - early_memunmap(va, map_size); - /* - * Map a little more than the fixed size entry - * header, anticipating some data. It's safe to - * do so as long as we stay within current memory - * descriptor. - */ - map_size = min(map_size_needed + 2*EFI_PAGE_SIZE, - offset_limit); - va = early_memremap(efi.mokvar_table, map_size); - if (!va) { - pr_err("Failed to map EFI MOKvar config table pa=0x%lx, size=%lu.\n", - efi.mokvar_table, map_size); - return; - } - mokvar_entry = va + cur_offset; + if (va) + early_memunmap(va, sizeof(*mokvar_entry)); + va = early_memremap(efi.mokvar_table + cur_offset, sizeof(*mokvar_entry)); + if (!va) { + pr_err("Failed to map EFI MOKvar config table pa=0x%lx, size=%zu.\n", + efi.mokvar_table + cur_offset, sizeof(*mokvar_entry)); + return; } - + mokvar_entry = va; +next: /* Check for last sentinel entry */ if (mokvar_entry->name[0] == '\0') { if (mokvar_entry->data_size != 0) break; err = 0; + map_size_needed = cur_offset + sizeof(*mokvar_entry); break; } - /* Sanity check that the name is null terminated */ - size = strnlen(mokvar_entry->name, - sizeof(mokvar_entry->name)); - if (size >= sizeof(mokvar_entry->name)) - break; + /* Enforce that the name is NUL terminated */ + mokvar_entry->name[sizeof(mokvar_entry->name) - 1] = '\0'; /* Advance to the next entry */ - cur_offset = map_size_needed + mokvar_entry->data_size; + size = sizeof(*mokvar_entry) + mokvar_entry->data_size; + cur_offset += size; + + /* + * Don't bother remapping if the current entry header and the + * next one end on the same page. + */ + next_entry = (void *)((unsigned long)mokvar_entry + size); + if (((((unsigned long)(mokvar_entry + 1) - 1) ^ + ((unsigned long)(next_entry + 1) - 1)) & PAGE_MASK) == 0) { + mokvar_entry = next_entry; + goto next; + } } if (va) - early_memunmap(va, map_size); + early_memunmap(va, sizeof(*mokvar_entry)); if (err) { pr_err("EFI MOKvar config table is not valid\n"); return; diff --git a/drivers/firmware/imx/Kconfig b/drivers/firmware/imx/Kconfig index 907cd149c40a8..c964f4924359f 100644 --- a/drivers/firmware/imx/Kconfig +++ b/drivers/firmware/imx/Kconfig @@ -25,6 +25,7 @@ config IMX_SCU config IMX_SCMI_MISC_DRV tristate "IMX SCMI MISC Protocol driver" + depends on ARCH_MXC || COMPILE_TEST default y if ARCH_MXC help The System Controller Management Interface firmware (SCMI FW) is diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 65f41cc3eafcc..d668ddb2e81d3 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -119,10 +119,15 @@ static ssize_t new_device_store(struct device_driver *driver, const char *buf, struct platform_device *pdev; int res, id; + if (!try_module_get(THIS_MODULE)) + return -ENOENT; + /* kernfs guarantees string termination, so count + 1 is safe */ aggr = kzalloc(sizeof(*aggr) + count + 1, GFP_KERNEL); - if (!aggr) - return -ENOMEM; + if (!aggr) { + res = -ENOMEM; + goto put_module; + } memcpy(aggr->args, buf, count + 1); @@ -161,6 +166,7 @@ static ssize_t new_device_store(struct device_driver *driver, const char *buf, } aggr->pdev = pdev; + module_put(THIS_MODULE); return count; remove_table: @@ -175,6 +181,8 @@ static ssize_t new_device_store(struct device_driver *driver, const char *buf, kfree(aggr->lookups); free_ga: kfree(aggr); +put_module: + module_put(THIS_MODULE); return res; } @@ -203,13 +211,19 @@ static ssize_t delete_device_store(struct device_driver *driver, if (error) return error; + if (!try_module_get(THIS_MODULE)) + return -ENOENT; + mutex_lock(&gpio_aggregator_lock); aggr = idr_remove(&gpio_aggregator_idr, id); mutex_unlock(&gpio_aggregator_lock); - if (!aggr) + if (!aggr) { + module_put(THIS_MODULE); return -ENOENT; + } gpio_aggregator_free(aggr); + module_put(THIS_MODULE); return count; } static DRIVER_ATTR_WO(delete_device); diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c index 2ecee3269a0cc..a7a1cdf7ac66d 100644 --- a/drivers/gpio/gpio-rcar.c +++ b/drivers/gpio/gpio-rcar.c @@ -40,7 +40,7 @@ struct gpio_rcar_info { struct gpio_rcar_priv { void __iomem *base; - spinlock_t lock; + raw_spinlock_t lock; struct device *dev; struct gpio_chip gpio_chip; unsigned int irq_parent; @@ -123,7 +123,7 @@ static void gpio_rcar_config_interrupt_input_mode(struct gpio_rcar_priv *p, * "Setting Level-Sensitive Interrupt Input Mode" */ - spin_lock_irqsave(&p->lock, flags); + raw_spin_lock_irqsave(&p->lock, flags); /* Configure positive or negative logic in POSNEG */ gpio_rcar_modify_bit(p, POSNEG, hwirq, !active_high_rising_edge); @@ -142,7 +142,7 @@ static void gpio_rcar_config_interrupt_input_mode(struct gpio_rcar_priv *p, if (!level_trigger) gpio_rcar_write(p, INTCLR, BIT(hwirq)); - spin_unlock_irqrestore(&p->lock, flags); + raw_spin_unlock_irqrestore(&p->lock, flags); } static int gpio_rcar_irq_set_type(struct irq_data *d, unsigned int type) @@ -246,7 +246,7 @@ static void gpio_rcar_config_general_input_output_mode(struct gpio_chip *chip, * "Setting General Input Mode" */ - spin_lock_irqsave(&p->lock, flags); + raw_spin_lock_irqsave(&p->lock, flags); /* Configure positive logic in POSNEG */ gpio_rcar_modify_bit(p, POSNEG, gpio, false); @@ -261,7 +261,7 @@ static void gpio_rcar_config_general_input_output_mode(struct gpio_chip *chip, if (p->info.has_outdtsel && output) gpio_rcar_modify_bit(p, OUTDTSEL, gpio, false); - spin_unlock_irqrestore(&p->lock, flags); + raw_spin_unlock_irqrestore(&p->lock, flags); } static int gpio_rcar_request(struct gpio_chip *chip, unsigned offset) @@ -347,7 +347,7 @@ static int gpio_rcar_get_multiple(struct gpio_chip *chip, unsigned long *mask, return 0; } - spin_lock_irqsave(&p->lock, flags); + raw_spin_lock_irqsave(&p->lock, flags); outputs = gpio_rcar_read(p, INOUTSEL); m = outputs & bankmask; if (m) @@ -356,7 +356,7 @@ static int gpio_rcar_get_multiple(struct gpio_chip *chip, unsigned long *mask, m = ~outputs & bankmask; if (m) val |= gpio_rcar_read(p, INDT) & m; - spin_unlock_irqrestore(&p->lock, flags); + raw_spin_unlock_irqrestore(&p->lock, flags); bits[0] = val; return 0; @@ -367,9 +367,9 @@ static void gpio_rcar_set(struct gpio_chip *chip, unsigned offset, int value) struct gpio_rcar_priv *p = gpiochip_get_data(chip); unsigned long flags; - spin_lock_irqsave(&p->lock, flags); + raw_spin_lock_irqsave(&p->lock, flags); gpio_rcar_modify_bit(p, OUTDT, offset, value); - spin_unlock_irqrestore(&p->lock, flags); + raw_spin_unlock_irqrestore(&p->lock, flags); } static void gpio_rcar_set_multiple(struct gpio_chip *chip, unsigned long *mask, @@ -386,12 +386,12 @@ static void gpio_rcar_set_multiple(struct gpio_chip *chip, unsigned long *mask, if (!bankmask) return; - spin_lock_irqsave(&p->lock, flags); + raw_spin_lock_irqsave(&p->lock, flags); val = gpio_rcar_read(p, OUTDT); val &= ~bankmask; val |= (bankmask & bits[0]); gpio_rcar_write(p, OUTDT, val); - spin_unlock_irqrestore(&p->lock, flags); + raw_spin_unlock_irqrestore(&p->lock, flags); } static int gpio_rcar_direction_output(struct gpio_chip *chip, unsigned offset, @@ -468,7 +468,12 @@ static int gpio_rcar_parse_dt(struct gpio_rcar_priv *p, unsigned int *npins) p->info = *info; ret = of_parse_phandle_with_fixed_args(np, "gpio-ranges", 3, 0, &args); - *npins = ret == 0 ? args.args[2] : RCAR_MAX_GPIO_PER_BANK; + if (ret) { + *npins = RCAR_MAX_GPIO_PER_BANK; + } else { + *npins = args.args[2]; + of_node_put(args.np); + } if (*npins == 0 || *npins > RCAR_MAX_GPIO_PER_BANK) { dev_warn(p->dev, "Invalid number of gpio lines %u, using %u\n", @@ -505,7 +510,7 @@ static int gpio_rcar_probe(struct platform_device *pdev) return -ENOMEM; p->dev = dev; - spin_lock_init(&p->lock); + raw_spin_lock_init(&p->lock); /* Get device configuration from DT node */ ret = gpio_rcar_parse_dt(p, &npins); diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index c4f34a347cb6e..c36a9dbccd4dd 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -36,6 +36,7 @@ struct vf610_gpio_port { struct clk *clk_port; struct clk *clk_gpio; int irq; + spinlock_t lock; /* protect gpio direction registers */ }; #define GPIO_PDOR 0x00 @@ -124,6 +125,7 @@ static int vf610_gpio_direction_input(struct gpio_chip *chip, unsigned int gpio) u32 val; if (port->sdata->have_paddr) { + guard(spinlock_irqsave)(&port->lock); val = vf610_gpio_readl(port->gpio_base + GPIO_PDDR); val &= ~mask; vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR); @@ -142,6 +144,7 @@ static int vf610_gpio_direction_output(struct gpio_chip *chip, unsigned int gpio vf610_gpio_set(chip, gpio, value); if (port->sdata->have_paddr) { + guard(spinlock_irqsave)(&port->lock); val = vf610_gpio_readl(port->gpio_base + GPIO_PDDR); val |= mask; vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR); @@ -297,6 +300,7 @@ static int vf610_gpio_probe(struct platform_device *pdev) return -ENOMEM; port->sdata = device_get_match_data(dev); + spin_lock_init(&port->lock); dual_base = port->sdata->have_dual_base; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index ca2f58a2cd45e..8741600af7efb 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1057,8 +1057,19 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, desc->gdev = gdev; if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) { - assign_bit(FLAG_IS_OUT, - &desc->flags, !gc->get_direction(gc, desc_index)); + ret = gc->get_direction(gc, desc_index); + if (ret < 0) + /* + * FIXME: Bail-out here once all GPIO drivers + * are updated to not return errors in + * situations that can be considered normal + * operation. + */ + dev_warn(&gdev->dev, + "%s: get_direction failed: %d\n", + __func__, ret); + + assign_bit(FLAG_IS_OUT, &desc->flags, !ret); } else { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->direction_input); @@ -2701,7 +2712,7 @@ EXPORT_SYMBOL_GPL(gpiod_direction_input); int gpiod_direction_input_nonotify(struct gpio_desc *desc) { - int ret = 0; + int ret = 0, dir; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) @@ -2728,13 +2739,18 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) if (guard.gc->direction_input) { ret = guard.gc->direction_input(guard.gc, gpio_chip_hwgpio(desc)); - } else if (guard.gc->get_direction && - (guard.gc->get_direction(guard.gc, - gpio_chip_hwgpio(desc)) != 1)) { - gpiod_warn(desc, - "%s: missing direction_input() operation and line is output\n", - __func__); - return -EIO; + } else if (guard.gc->get_direction) { + dir = guard.gc->get_direction(guard.gc, + gpio_chip_hwgpio(desc)); + if (dir < 0) + return dir; + + if (dir != GPIO_LINE_DIRECTION_IN) { + gpiod_warn(desc, + "%s: missing direction_input() operation and line is output\n", + __func__); + return -EIO; + } } if (ret == 0) { clear_bit(FLAG_IS_OUT, &desc->flags); @@ -2748,7 +2764,7 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) { - int val = !!value, ret = 0; + int val = !!value, ret = 0, dir; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) @@ -2771,12 +2787,18 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) gpio_chip_hwgpio(desc), val); } else { /* Check that we are in output mode if we can */ - if (guard.gc->get_direction && - guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc))) { - gpiod_warn(desc, - "%s: missing direction_output() operation\n", - __func__); - return -EIO; + if (guard.gc->get_direction) { + dir = guard.gc->get_direction(guard.gc, + gpio_chip_hwgpio(desc)); + if (dir < 0) + return dir; + + if (dir != GPIO_LINE_DIRECTION_OUT) { + gpiod_warn(desc, + "%s: missing direction_output() operation\n", + __func__); + return -EIO; + } } /* * If we can't actively set the direction, we are some @@ -3129,6 +3151,8 @@ static int gpiod_get_raw_value_commit(const struct gpio_desc *desc) static int gpio_chip_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + lockdep_assert_held(&gc->gpiodev->srcu); + if (gc->get_multiple) return gc->get_multiple(gc, mask, bits); if (gc->get) { @@ -3159,6 +3183,7 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep, struct gpio_array *array_info, unsigned long *value_bitmap) { + struct gpio_chip *gc; int ret, i = 0; /* @@ -3170,10 +3195,15 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep, array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) - WARN_ON(array_info->chip->can_sleep); + WARN_ON(array_info->gdev->can_sleep); + + guard(srcu)(&array_info->gdev->srcu); + gc = srcu_dereference(array_info->gdev->chip, + &array_info->gdev->srcu); + if (!gc) + return -ENODEV; - ret = gpio_chip_get_multiple(array_info->chip, - array_info->get_mask, + ret = gpio_chip_get_multiple(gc, array_info->get_mask, value_bitmap); if (ret) return ret; @@ -3454,6 +3484,8 @@ static void gpiod_set_raw_value_commit(struct gpio_desc *desc, bool value) static void gpio_chip_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + lockdep_assert_held(&gc->gpiodev->srcu); + if (gc->set_multiple) { gc->set_multiple(gc, mask, bits); } else { @@ -3471,6 +3503,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, struct gpio_array *array_info, unsigned long *value_bitmap) { + struct gpio_chip *gc; int i = 0; /* @@ -3482,14 +3515,19 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) - WARN_ON(array_info->chip->can_sleep); + WARN_ON(array_info->gdev->can_sleep); + + guard(srcu)(&array_info->gdev->srcu); + gc = srcu_dereference(array_info->gdev->chip, + &array_info->gdev->srcu); + if (!gc) + return -ENODEV; if (!raw && !bitmap_empty(array_info->invert_mask, array_size)) bitmap_xor(value_bitmap, value_bitmap, array_info->invert_mask, array_size); - gpio_chip_set_multiple(array_info->chip, array_info->set_mask, - value_bitmap); + gpio_chip_set_multiple(gc, array_info->set_mask, value_bitmap); i = find_first_zero_bit(array_info->set_mask, array_size); if (i == array_size) @@ -4751,9 +4789,10 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, { struct gpio_desc *desc; struct gpio_descs *descs; + struct gpio_device *gdev; struct gpio_array *array_info = NULL; - struct gpio_chip *gc; int count, bitmap_size; + unsigned long dflags; size_t descs_size; count = gpiod_count(dev, con_id); @@ -4774,7 +4813,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, descs->desc[descs->ndescs] = desc; - gc = gpiod_to_chip(desc); + gdev = gpiod_to_gpio_device(desc); /* * If pin hardware number of array member 0 is also 0, select * its chip as a candidate for fast bitmap processing path. @@ -4782,8 +4821,8 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, if (descs->ndescs == 0 && gpio_chip_hwgpio(desc) == 0) { struct gpio_descs *array; - bitmap_size = BITS_TO_LONGS(gc->ngpio > count ? - gc->ngpio : count); + bitmap_size = BITS_TO_LONGS(gdev->ngpio > count ? + gdev->ngpio : count); array = krealloc(descs, descs_size + struct_size(array_info, invert_mask, 3 * bitmap_size), @@ -4803,7 +4842,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, array_info->desc = descs->desc; array_info->size = count; - array_info->chip = gc; + array_info->gdev = gdev; bitmap_set(array_info->get_mask, descs->ndescs, count - descs->ndescs); bitmap_set(array_info->set_mask, descs->ndescs, @@ -4816,7 +4855,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, continue; /* Unmark array members which don't belong to the 'fast' chip */ - if (array_info->chip != gc) { + if (array_info->gdev != gdev) { __clear_bit(descs->ndescs, array_info->get_mask); __clear_bit(descs->ndescs, array_info->set_mask); } @@ -4839,9 +4878,10 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, array_info->set_mask); } } else { + dflags = READ_ONCE(desc->flags); /* Exclude open drain or open source from fast output */ - if (gpiochip_line_is_open_drain(gc, descs->ndescs) || - gpiochip_line_is_open_source(gc, descs->ndescs)) + if (test_bit(FLAG_OPEN_DRAIN, &dflags) || + test_bit(FLAG_OPEN_SOURCE, &dflags)) __clear_bit(descs->ndescs, array_info->set_mask); /* Identify 'fast' pins which require invertion */ @@ -4853,7 +4893,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, if (array_info) dev_dbg(dev, "GPIO array info: chip=%s, size=%d, get_mask=%lx, set_mask=%lx, invert_mask=%lx\n", - array_info->chip->label, array_info->size, + array_info->gdev->label, array_info->size, *array_info->get_mask, *array_info->set_mask, *array_info->invert_mask); return descs; diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 83690f72f7e5c..147156ec502b2 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -114,7 +114,7 @@ extern const char *const gpio_suffixes[]; * * @desc: Array of pointers to the GPIO descriptors * @size: Number of elements in desc - * @chip: Parent GPIO chip + * @gdev: Parent GPIO device * @get_mask: Get mask used in fastpath * @set_mask: Set mask used in fastpath * @invert_mask: Invert mask used in fastpath @@ -126,7 +126,7 @@ extern const char *const gpio_suffixes[]; struct gpio_array { struct gpio_desc **desc; unsigned int size; - struct gpio_chip *chip; + struct gpio_device *gdev; unsigned long *get_mask; unsigned long *set_mask; unsigned long invert_mask[]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d100bb7a137cd..018dfccd771ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1638,6 +1638,13 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + /* resizing on Dell G5 SE platforms causes problems with runtime pm */ + if ((amdgpu_runtime_pm != 0) && + adev->pdev->vendor == PCI_VENDOR_ID_ATI && + adev->pdev->device == 0x731f && + adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) + return 0; + /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) DRM_WARN("System can't access extended configuration space, please check!!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 784b03abb3a43..c1f35ded684e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1638,22 +1638,19 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, } mutex_lock(&adev->enforce_isolation_mutex); - for (i = 0; i < num_partitions; i++) { - if (adev->enforce_isolation[i] && !partition_values[i]) { + if (adev->enforce_isolation[i] && !partition_values[i]) /* Going from enabled to disabled */ amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); - amdgpu_mes_set_enforce_isolation(adev, i, false); - } else if (!adev->enforce_isolation[i] && partition_values[i]) { + else if (!adev->enforce_isolation[i] && partition_values[i]) /* Going from disabled to enabled */ amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); - amdgpu_mes_set_enforce_isolation(adev, i, true); - } adev->enforce_isolation[i] = partition_values[i]; } - mutex_unlock(&adev->enforce_isolation_mutex); + amdgpu_mes_update_enforce_isolation(adev); + return count; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 32b27a1658e78..709c11cbeabd8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1681,7 +1681,8 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) } /* Fix me -- node_id is used to identify the correct MES instances in the future */ -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable) +static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, + uint32_t node_id, bool enable) { struct mes_misc_op_input op_input = {0}; int r; @@ -1703,6 +1704,23 @@ int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_i return r; } +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev) +{ + int i, r = 0; + + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) { + mutex_lock(&adev->enforce_isolation_mutex); + for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) { + if (adev->enforce_isolation[i]) + r |= amdgpu_mes_set_enforce_isolation(adev, i, true); + else + r |= amdgpu_mes_set_enforce_isolation(adev, i, false); + } + mutex_unlock(&adev->enforce_isolation_mutex); + } + return r; +} + #if defined(CONFIG_DEBUG_FS) static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 2df2444ee892c..e98ea7ede1bab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -534,6 +534,6 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable); +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev); #endif /* __AMDGPU_MES_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 01ae2f88dec8c..262bd010a283d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2281,7 +2281,7 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; struct amdgpu_res_cursor cursor; u64 addr; - int r; + int r = 0; if (!adev->mman.buffer_funcs_enabled) return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c index 03308261f8943..7507d94430287 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c @@ -188,8 +188,8 @@ static int amdgpu_vkms_crtc_init(struct drm_device *dev, struct drm_crtc *crtc, amdgpu_crtc->connector = NULL; amdgpu_crtc->vsync_timer_enabled = AMDGPU_IRQ_STATE_DISABLE; - hrtimer_init(&amdgpu_crtc->vblank_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - amdgpu_crtc->vblank_timer.function = &amdgpu_vkms_vblank_simulate; + hrtimer_setup(&amdgpu_crtc->vblank_timer, &amdgpu_vkms_vblank_simulate, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 65f389eb65e5f..f9a4d08eef925 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1633,6 +1633,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block) goto failure; } + r = amdgpu_mes_update_enforce_isolation(adev); + if (r) + goto failure; + out: /* * Disable KIQ ring usage from the driver once MES is enabled. diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 901e924e69ad9..0fd0fa6ed5184 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1743,6 +1743,10 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block) goto failure; } + r = amdgpu_mes_update_enforce_isolation(adev); + if (r) + goto failure; + out: /* * Disable KIQ ring usage from the driver once MES is enabled. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index 2eff37aaf8273..1695dd78ede8e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -107,6 +107,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -167,10 +169,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 68dbc0399c87a..3c0ae28c5923b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -154,6 +154,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -221,10 +223,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c index 2b72d5b4949b6..565858b9044d4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -121,6 +121,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -184,10 +186,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index ff417d5361c42..3014925d95ffc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -183,6 +183,9 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -245,7 +248,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index ecccd7adbab4d..24396a2c77bd0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -266,8 +266,8 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope /* EOP buffer is not required for all ASICs */ if (properties->eop_ring_buffer_address) { if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) { - pr_debug("queue eop bo size 0x%lx not equal to node eop buf size 0x%x\n", - properties->eop_buf_bo->tbo.base.size, + pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n", + properties->eop_ring_buffer_size, topo_dev->node_props.eop_buffer_size); err = -EINVAL; goto out_err_unreserve; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index ac3fd81fecef2..9d9645a2d18ef 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1618,75 +1618,130 @@ static bool dm_should_disable_stutter(struct pci_dev *pdev) return false; } -static const struct dmi_system_id hpd_disconnect_quirk_table[] = { +struct amdgpu_dm_quirks { + bool aux_hpd_discon; + bool support_edp0_on_dp1; +}; + +static struct amdgpu_dm_quirks quirk_entries = { + .aux_hpd_discon = false, + .support_edp0_on_dp1 = false +}; + +static int edp0_on_dp1_callback(const struct dmi_system_id *id) +{ + quirk_entries.support_edp0_on_dp1 = true; + return 0; +} + +static int aux_hpd_discon_callback(const struct dmi_system_id *id) +{ + quirk_entries.aux_hpd_discon = true; + return 0; +} + +static const struct dmi_system_id dmi_quirk_table[] = { { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3660"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3260"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3460"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Tower Plus 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Tower 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex SFF Plus 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex SFF 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Micro Plus 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Micro 7010"), }, }, + { + .callback = edp0_on_dp1_callback, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Elite mt645 G8 Mobile Thin Client"), + }, + }, + { + .callback = edp0_on_dp1_callback, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP EliteBook 665 16 inch G11 Notebook PC"), + }, + }, {} /* TODO: refactor this from a fixed table to a dynamic option */ }; -static void retrieve_dmi_info(struct amdgpu_display_manager *dm) +static void retrieve_dmi_info(struct amdgpu_display_manager *dm, struct dc_init_data *init_data) { - const struct dmi_system_id *dmi_id; + int dmi_id; + struct drm_device *dev = dm->ddev; dm->aux_hpd_discon_quirk = false; + init_data->flags.support_edp0_on_dp1 = false; + + dmi_id = dmi_check_system(dmi_quirk_table); + + if (!dmi_id) + return; - dmi_id = dmi_first_match(hpd_disconnect_quirk_table); - if (dmi_id) { + if (quirk_entries.aux_hpd_discon) { dm->aux_hpd_discon_quirk = true; - DRM_INFO("aux_hpd_discon_quirk attached\n"); + drm_info(dev, "aux_hpd_discon_quirk attached\n"); + } + if (quirk_entries.support_edp0_on_dp1) { + init_data->flags.support_edp0_on_dp1 = true; + drm_info(dev, "aux_hpd_discon_quirk attached\n"); } } @@ -1994,7 +2049,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0)) init_data.num_virtual_links = 1; - retrieve_dmi_info(&adev->dm); + retrieve_dmi_info(&adev->dm, &init_data); if (adev->dm.bb_from_dmub) init_data.bb_from_dmub = adev->dm.bb_from_dmub; @@ -7240,8 +7295,14 @@ static void amdgpu_dm_connector_funcs_force(struct drm_connector *connector) struct dc_link *dc_link = aconnector->dc_link; struct dc_sink *dc_em_sink = aconnector->dc_em_sink; const struct drm_edid *drm_edid; + struct i2c_adapter *ddc; + + if (dc_link && dc_link->aux_mode) + ddc = &aconnector->dm_dp_aux.aux.ddc; + else + ddc = &aconnector->i2c->base; - drm_edid = drm_edid_read(connector); + drm_edid = drm_edid_read_ddc(connector, ddc); drm_edid_connector_update(connector, drm_edid); if (!drm_edid) { DRM_ERROR("No EDID found on connector: %s.\n", connector->name); @@ -7286,14 +7347,21 @@ static int get_modes(struct drm_connector *connector) static void create_eml_sink(struct amdgpu_dm_connector *aconnector) { struct drm_connector *connector = &aconnector->base; + struct dc_link *dc_link = aconnector->dc_link; struct dc_sink_init_data init_params = { .link = aconnector->dc_link, .sink_signal = SIGNAL_TYPE_VIRTUAL }; const struct drm_edid *drm_edid; const struct edid *edid; + struct i2c_adapter *ddc; + + if (dc_link && dc_link->aux_mode) + ddc = &aconnector->dm_dp_aux.aux.ddc; + else + ddc = &aconnector->i2c->base; - drm_edid = drm_edid_read(connector); + drm_edid = drm_edid_read_ddc(connector, ddc); drm_edid_connector_update(connector, drm_edid); if (!drm_edid) { DRM_ERROR("No EDID found on connector: %s.\n", connector->name); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c index 3390f0d8420a0..c4a7fd453e5fc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c @@ -894,6 +894,7 @@ void amdgpu_dm_hpd_init(struct amdgpu_device *adev) struct drm_device *dev = adev_to_drm(adev); struct drm_connector *connector; struct drm_connector_list_iter iter; + int i; drm_connector_list_iter_begin(dev, &iter); drm_for_each_connector_iter(connector, &iter) { @@ -920,6 +921,12 @@ void amdgpu_dm_hpd_init(struct amdgpu_device *adev) } } drm_connector_list_iter_end(&iter); + + /* Update reference counts for HPDs */ + for (i = DC_IRQ_SOURCE_HPD1; i <= adev->mode_info.num_hpd; i++) { + if (amdgpu_irq_get(adev, &adev->hpd_irq, i - DC_IRQ_SOURCE_HPD1)) + drm_err(dev, "DM_IRQ: Failed get HPD for source=%d)!\n", i); + } } /** @@ -935,6 +942,7 @@ void amdgpu_dm_hpd_fini(struct amdgpu_device *adev) struct drm_device *dev = adev_to_drm(adev); struct drm_connector *connector; struct drm_connector_list_iter iter; + int i; drm_connector_list_iter_begin(dev, &iter); drm_for_each_connector_iter(connector, &iter) { @@ -960,4 +968,10 @@ void amdgpu_dm_hpd_fini(struct amdgpu_device *adev) } } drm_connector_list_iter_end(&iter); + + /* Update reference counts for HPDs */ + for (i = DC_IRQ_SOURCE_HPD1; i <= adev->mode_info.num_hpd; i++) { + if (amdgpu_irq_put(adev, &adev->hpd_irq, i - DC_IRQ_SOURCE_HPD1)) + drm_err(dev, "DM_IRQ: Failed put HPD for source=%d!\n", i); + } } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c index 45858bf1523d8..e140b7a04d724 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c @@ -54,7 +54,8 @@ static bool link_supports_psrsu(struct dc_link *link) if (amdgpu_dc_debug_mask & DC_DISABLE_PSR_SU) return false; - return dc_dmub_check_min_version(dc->ctx->dmub_srv->dmub); + /* Temporarily disable PSR-SU to avoid glitches */ + return false; } /* diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 520a34a42827b..a45037cb4cc01 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -1455,7 +1455,8 @@ bool resource_build_scaling_params(struct pipe_ctx *pipe_ctx) DC_LOGGER_INIT(pipe_ctx->stream->ctx->logger); /* Invalid input */ - if (!plane_state->dst_rect.width || + if (!plane_state || + !plane_state->dst_rect.width || !plane_state->dst_rect.height || !plane_state->src_rect.width || !plane_state->src_rect.height) { diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c index 67a8e22b1126d..e237ea1185a71 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c @@ -3042,6 +3042,7 @@ static int kv_dpm_hw_init(struct amdgpu_ip_block *ip_block) if (!amdgpu_dpm) return 0; + mutex_lock(&adev->pm.mutex); kv_dpm_setup_asic(adev); ret = kv_dpm_enable(adev); if (ret) @@ -3049,6 +3050,8 @@ static int kv_dpm_hw_init(struct amdgpu_ip_block *ip_block) else adev->pm.dpm_enabled = true; amdgpu_legacy_dpm_compute_clocks(adev); + mutex_unlock(&adev->pm.mutex); + return ret; } @@ -3066,32 +3069,42 @@ static int kv_dpm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + cancel_work_sync(&adev->pm.dpm.thermal.work); + if (adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); + adev->pm.dpm_enabled = false; /* disable dpm */ kv_dpm_disable(adev); /* reset the power state */ adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; + mutex_unlock(&adev->pm.mutex); } return 0; } static int kv_dpm_resume(struct amdgpu_ip_block *ip_block) { - int ret; + int ret = 0; struct amdgpu_device *adev = ip_block->adev; - if (adev->pm.dpm_enabled) { + if (!amdgpu_dpm) + return 0; + + if (!adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); /* asic init will reset to the boot state */ kv_dpm_setup_asic(adev); ret = kv_dpm_enable(adev); - if (ret) + if (ret) { adev->pm.dpm_enabled = false; - else + } else { adev->pm.dpm_enabled = true; - if (adev->pm.dpm_enabled) amdgpu_legacy_dpm_compute_clocks(adev); + } + mutex_unlock(&adev->pm.mutex); } - return 0; + return ret; } static bool kv_dpm_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c index e861355ebd75b..c7518b13e7879 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c @@ -1009,9 +1009,12 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) enum amd_pm_state_type dpm_state = POWER_STATE_TYPE_INTERNAL_THERMAL; int temp, size = sizeof(temp); - if (!adev->pm.dpm_enabled) - return; + mutex_lock(&adev->pm.mutex); + if (!adev->pm.dpm_enabled) { + mutex_unlock(&adev->pm.mutex); + return; + } if (!pp_funcs->read_sensor(adev->powerplay.pp_handle, AMDGPU_PP_SENSOR_GPU_TEMP, (void *)&temp, @@ -1033,4 +1036,5 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) adev->pm.dpm.state = dpm_state; amdgpu_legacy_dpm_compute_clocks(adev->powerplay.pp_handle); + mutex_unlock(&adev->pm.mutex); } diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c index a87dcf0974bc1..d6dfe2599ebea 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c @@ -7786,6 +7786,7 @@ static int si_dpm_hw_init(struct amdgpu_ip_block *ip_block) if (!amdgpu_dpm) return 0; + mutex_lock(&adev->pm.mutex); si_dpm_setup_asic(adev); ret = si_dpm_enable(adev); if (ret) @@ -7793,6 +7794,7 @@ static int si_dpm_hw_init(struct amdgpu_ip_block *ip_block) else adev->pm.dpm_enabled = true; amdgpu_legacy_dpm_compute_clocks(adev); + mutex_unlock(&adev->pm.mutex); return ret; } @@ -7810,32 +7812,44 @@ static int si_dpm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + cancel_work_sync(&adev->pm.dpm.thermal.work); + if (adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); + adev->pm.dpm_enabled = false; /* disable dpm */ si_dpm_disable(adev); /* reset the power state */ adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; + mutex_unlock(&adev->pm.mutex); } + return 0; } static int si_dpm_resume(struct amdgpu_ip_block *ip_block) { - int ret; + int ret = 0; struct amdgpu_device *adev = ip_block->adev; - if (adev->pm.dpm_enabled) { + if (!amdgpu_dpm) + return 0; + + if (!adev->pm.dpm_enabled) { /* asic init will reset to the boot state */ + mutex_lock(&adev->pm.mutex); si_dpm_setup_asic(adev); ret = si_dpm_enable(adev); - if (ret) + if (ret) { adev->pm.dpm_enabled = false; - else + } else { adev->pm.dpm_enabled = true; - if (adev->pm.dpm_enabled) amdgpu_legacy_dpm_compute_clocks(adev); + } + mutex_unlock(&adev->pm.mutex); } - return 0; + + return ret; } static bool si_dpm_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c index 9b2f4fe1578b8..ddb6444406d28 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0.c @@ -1895,16 +1895,6 @@ static int smu_v14_0_allow_ih_interrupt(struct smu_context *smu) NULL); } -static int smu_v14_0_process_pending_interrupt(struct smu_context *smu) -{ - int ret = 0; - - if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_ACDC_BIT)) - ret = smu_v14_0_allow_ih_interrupt(smu); - - return ret; -} - int smu_v14_0_enable_thermal_alert(struct smu_context *smu) { int ret = 0; @@ -1916,7 +1906,7 @@ int smu_v14_0_enable_thermal_alert(struct smu_context *smu) if (ret) return ret; - return smu_v14_0_process_pending_interrupt(smu); + return smu_v14_0_allow_ih_interrupt(smu); } int smu_v14_0_disable_thermal_alert(struct smu_context *smu) diff --git a/drivers/gpu/drm/drm_fbdev_dma.c b/drivers/gpu/drm/drm_fbdev_dma.c index b14b581c059d3..02a516e771927 100644 --- a/drivers/gpu/drm/drm_fbdev_dma.c +++ b/drivers/gpu/drm/drm_fbdev_dma.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #include +#include #include #include @@ -70,37 +71,102 @@ static const struct fb_ops drm_fbdev_dma_fb_ops = { .fb_destroy = drm_fbdev_dma_fb_destroy, }; -FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(drm_fbdev_dma, +FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(drm_fbdev_dma_shadowed, drm_fb_helper_damage_range, drm_fb_helper_damage_area); -static int drm_fbdev_dma_deferred_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) +static void drm_fbdev_dma_shadowed_fb_destroy(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; - struct drm_framebuffer *fb = fb_helper->fb; - struct drm_gem_dma_object *dma = drm_fb_dma_get_gem_obj(fb, 0); + void *shadow = info->screen_buffer; + + if (!fb_helper->dev) + return; - if (!dma->map_noncoherent) - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + if (info->fbdefio) + fb_deferred_io_cleanup(info); + drm_fb_helper_fini(fb_helper); + vfree(shadow); - return fb_deferred_io_mmap(info, vma); + drm_client_buffer_vunmap(fb_helper->buffer); + drm_client_framebuffer_delete(fb_helper->buffer); + drm_client_release(&fb_helper->client); + drm_fb_helper_unprepare(fb_helper); + kfree(fb_helper); } -static const struct fb_ops drm_fbdev_dma_deferred_fb_ops = { +static const struct fb_ops drm_fbdev_dma_shadowed_fb_ops = { .owner = THIS_MODULE, .fb_open = drm_fbdev_dma_fb_open, .fb_release = drm_fbdev_dma_fb_release, - __FB_DEFAULT_DEFERRED_OPS_RDWR(drm_fbdev_dma), + FB_DEFAULT_DEFERRED_OPS(drm_fbdev_dma_shadowed), DRM_FB_HELPER_DEFAULT_OPS, - __FB_DEFAULT_DEFERRED_OPS_DRAW(drm_fbdev_dma), - .fb_mmap = drm_fbdev_dma_deferred_fb_mmap, - .fb_destroy = drm_fbdev_dma_fb_destroy, + .fb_destroy = drm_fbdev_dma_shadowed_fb_destroy, }; /* * struct drm_fb_helper */ +static void drm_fbdev_dma_damage_blit_real(struct drm_fb_helper *fb_helper, + struct drm_clip_rect *clip, + struct iosys_map *dst) +{ + struct drm_framebuffer *fb = fb_helper->fb; + size_t offset = clip->y1 * fb->pitches[0]; + size_t len = clip->x2 - clip->x1; + unsigned int y; + void *src; + + switch (drm_format_info_bpp(fb->format, 0)) { + case 1: + offset += clip->x1 / 8; + len = DIV_ROUND_UP(len + clip->x1 % 8, 8); + break; + case 2: + offset += clip->x1 / 4; + len = DIV_ROUND_UP(len + clip->x1 % 4, 4); + break; + case 4: + offset += clip->x1 / 2; + len = DIV_ROUND_UP(len + clip->x1 % 2, 2); + break; + default: + offset += clip->x1 * fb->format->cpp[0]; + len *= fb->format->cpp[0]; + break; + } + + src = fb_helper->info->screen_buffer + offset; + iosys_map_incr(dst, offset); /* go to first pixel within clip rect */ + + for (y = clip->y1; y < clip->y2; y++) { + iosys_map_memcpy_to(dst, 0, src, len); + iosys_map_incr(dst, fb->pitches[0]); + src += fb->pitches[0]; + } +} + +static int drm_fbdev_dma_damage_blit(struct drm_fb_helper *fb_helper, + struct drm_clip_rect *clip) +{ + struct drm_client_buffer *buffer = fb_helper->buffer; + struct iosys_map dst; + + /* + * For fbdev emulation, we only have to protect against fbdev modeset + * operations. Nothing else will involve the client buffer's BO. So it + * is sufficient to acquire struct drm_fb_helper.lock here. + */ + mutex_lock(&fb_helper->lock); + + dst = buffer->map; + drm_fbdev_dma_damage_blit_real(fb_helper, clip, &dst); + + mutex_unlock(&fb_helper->lock); + + return 0; +} static int drm_fbdev_dma_helper_fb_dirty(struct drm_fb_helper *helper, struct drm_clip_rect *clip) { @@ -112,6 +178,10 @@ static int drm_fbdev_dma_helper_fb_dirty(struct drm_fb_helper *helper, return 0; if (helper->fb->funcs->dirty) { + ret = drm_fbdev_dma_damage_blit(helper, clip); + if (drm_WARN_ONCE(dev, ret, "Damage blitter failed: ret=%d\n", ret)) + return ret; + ret = helper->fb->funcs->dirty(helper->fb, NULL, 0, 0, clip, 1); if (drm_WARN_ONCE(dev, ret, "Dirty helper failed: ret=%d\n", ret)) return ret; @@ -128,14 +198,80 @@ static const struct drm_fb_helper_funcs drm_fbdev_dma_helper_funcs = { * struct drm_fb_helper */ +static int drm_fbdev_dma_driver_fbdev_probe_tail(struct drm_fb_helper *fb_helper, + struct drm_fb_helper_surface_size *sizes) +{ + struct drm_device *dev = fb_helper->dev; + struct drm_client_buffer *buffer = fb_helper->buffer; + struct drm_gem_dma_object *dma_obj = to_drm_gem_dma_obj(buffer->gem); + struct drm_framebuffer *fb = fb_helper->fb; + struct fb_info *info = fb_helper->info; + struct iosys_map map = buffer->map; + + info->fbops = &drm_fbdev_dma_fb_ops; + + /* screen */ + info->flags |= FBINFO_VIRTFB; /* system memory */ + if (dma_obj->map_noncoherent) + info->flags |= FBINFO_READS_FAST; /* signal caching */ + info->screen_size = sizes->surface_height * fb->pitches[0]; + info->screen_buffer = map.vaddr; + if (!(info->flags & FBINFO_HIDE_SMEM_START)) { + if (!drm_WARN_ON(dev, is_vmalloc_addr(info->screen_buffer))) + info->fix.smem_start = page_to_phys(virt_to_page(info->screen_buffer)); + } + info->fix.smem_len = info->screen_size; + + return 0; +} + +static int drm_fbdev_dma_driver_fbdev_probe_tail_shadowed(struct drm_fb_helper *fb_helper, + struct drm_fb_helper_surface_size *sizes) +{ + struct drm_client_buffer *buffer = fb_helper->buffer; + struct fb_info *info = fb_helper->info; + size_t screen_size = buffer->gem->size; + void *screen_buffer; + int ret; + + /* + * Deferred I/O requires struct page for framebuffer memory, + * which is not guaranteed for all DMA ranges. We thus create + * a shadow buffer in system memory. + */ + screen_buffer = vzalloc(screen_size); + if (!screen_buffer) + return -ENOMEM; + + info->fbops = &drm_fbdev_dma_shadowed_fb_ops; + + /* screen */ + info->flags |= FBINFO_VIRTFB; /* system memory */ + info->flags |= FBINFO_READS_FAST; /* signal caching */ + info->screen_buffer = screen_buffer; + info->fix.smem_len = screen_size; + + fb_helper->fbdefio.delay = HZ / 20; + fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; + + info->fbdefio = &fb_helper->fbdefio; + ret = fb_deferred_io_init(info); + if (ret) + goto err_vfree; + + return 0; + +err_vfree: + vfree(screen_buffer); + return ret; +} + int drm_fbdev_dma_driver_fbdev_probe(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; - bool use_deferred_io = false; struct drm_client_buffer *buffer; - struct drm_gem_dma_object *dma_obj; struct drm_framebuffer *fb; struct fb_info *info; u32 format; @@ -152,19 +288,9 @@ int drm_fbdev_dma_driver_fbdev_probe(struct drm_fb_helper *fb_helper, sizes->surface_height, format); if (IS_ERR(buffer)) return PTR_ERR(buffer); - dma_obj = to_drm_gem_dma_obj(buffer->gem); fb = buffer->fb; - /* - * Deferred I/O requires struct page for framebuffer memory, - * which is not guaranteed for all DMA ranges. We thus only - * install deferred I/O if we have a framebuffer that requires - * it. - */ - if (fb->funcs->dirty) - use_deferred_io = true; - ret = drm_client_buffer_vmap(buffer, &map); if (ret) { goto err_drm_client_buffer_delete; @@ -185,45 +311,12 @@ int drm_fbdev_dma_driver_fbdev_probe(struct drm_fb_helper *fb_helper, drm_fb_helper_fill_info(info, fb_helper, sizes); - if (use_deferred_io) - info->fbops = &drm_fbdev_dma_deferred_fb_ops; + if (fb->funcs->dirty) + ret = drm_fbdev_dma_driver_fbdev_probe_tail_shadowed(fb_helper, sizes); else - info->fbops = &drm_fbdev_dma_fb_ops; - - /* screen */ - info->flags |= FBINFO_VIRTFB; /* system memory */ - if (dma_obj->map_noncoherent) - info->flags |= FBINFO_READS_FAST; /* signal caching */ - info->screen_size = sizes->surface_height * fb->pitches[0]; - info->screen_buffer = map.vaddr; - if (!(info->flags & FBINFO_HIDE_SMEM_START)) { - if (!drm_WARN_ON(dev, is_vmalloc_addr(info->screen_buffer))) - info->fix.smem_start = page_to_phys(virt_to_page(info->screen_buffer)); - } - info->fix.smem_len = info->screen_size; - - /* - * Only set up deferred I/O if the screen buffer supports - * it. If this disagrees with the previous test for ->dirty, - * mmap on the /dev/fb file might not work correctly. - */ - if (!is_vmalloc_addr(info->screen_buffer) && info->fix.smem_start) { - unsigned long pfn = info->fix.smem_start >> PAGE_SHIFT; - - if (drm_WARN_ON(dev, !pfn_to_page(pfn))) - use_deferred_io = false; - } - - /* deferred I/O */ - if (use_deferred_io) { - fb_helper->fbdefio.delay = HZ / 20; - fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; - - info->fbdefio = &fb_helper->fbdefio; - ret = fb_deferred_io_init(info); - if (ret) - goto err_drm_fb_helper_release_info; - } + ret = drm_fbdev_dma_driver_fbdev_probe_tail(fb_helper, sizes); + if (ret) + goto err_drm_fb_helper_release_info; return 0; diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index c977b74f82f0b..82bf6c654de25 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -809,8 +809,8 @@ gen11_dsi_configure_transcoder(struct intel_encoder *encoder, /* select data lane width */ tmp = intel_de_read(display, TRANS_DDI_FUNC_CTL(display, dsi_trans)); - tmp &= ~DDI_PORT_WIDTH_MASK; - tmp |= DDI_PORT_WIDTH(intel_dsi->lane_count); + tmp &= ~TRANS_DDI_PORT_WIDTH_MASK; + tmp |= TRANS_DDI_PORT_WIDTH(intel_dsi->lane_count); /* select input pipe */ tmp &= ~TRANS_DDI_EDP_INPUT_MASK; diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index acb986bc1f33a..ff2cf3daa7a2b 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -658,7 +658,6 @@ void intel_ddi_disable_transcoder_func(const struct intel_crtc_state *crtc_state struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc); struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); enum transcoder cpu_transcoder = crtc_state->cpu_transcoder; - bool is_mst = intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DP_MST); u32 ctl; if (DISPLAY_VER(dev_priv) >= 11) @@ -678,8 +677,7 @@ void intel_ddi_disable_transcoder_func(const struct intel_crtc_state *crtc_state TRANS_DDI_PORT_SYNC_MASTER_SELECT_MASK); if (DISPLAY_VER(dev_priv) >= 12) { - if (!intel_dp_mst_is_master_trans(crtc_state) || - (!is_mst && intel_dp_is_uhbr(crtc_state))) { + if (!intel_dp_mst_is_master_trans(crtc_state)) { ctl &= ~(TGL_TRANS_DDI_PORT_MASK | TRANS_DDI_MODE_SELECT_MASK); } @@ -868,7 +866,7 @@ static void intel_ddi_get_encoder_pipes(struct intel_encoder *encoder, encoder->base.base.id, encoder->base.name); if (!mst_pipe_mask && dp128b132b_pipe_mask) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + struct intel_digital_port *dig_port = enc_to_dig_port(encoder); /* * If we don't have 8b/10b MST, but have more than one @@ -880,7 +878,8 @@ static void intel_ddi_get_encoder_pipes(struct intel_encoder *encoder, * we don't expect MST to have been enabled at that point, and * can assume it's SST. */ - if (hweight8(dp128b132b_pipe_mask) > 1 || intel_dp->is_mst) + if (hweight8(dp128b132b_pipe_mask) > 1 || + intel_dp_mst_encoder_active_links(dig_port)) mst_pipe_mask = dp128b132b_pipe_mask; } @@ -3134,7 +3133,7 @@ static void intel_ddi_post_disable_dp(struct intel_atomic_state *state, intel_dp_set_power(intel_dp, DP_SET_POWER_D3); if (DISPLAY_VER(dev_priv) >= 12) { - if (is_mst) { + if (is_mst || intel_dp_is_uhbr(old_crtc_state)) { enum transcoder cpu_transcoder = old_crtc_state->cpu_transcoder; intel_de_rmw(dev_priv, @@ -3487,7 +3486,7 @@ static void intel_ddi_enable_hdmi(struct intel_atomic_state *state, intel_de_rmw(dev_priv, XELPDP_PORT_BUF_CTL1(dev_priv, port), XELPDP_PORT_WIDTH_MASK | XELPDP_PORT_REVERSAL, port_buf); - buf_ctl |= DDI_PORT_WIDTH(lane_count); + buf_ctl |= DDI_PORT_WIDTH(crtc_state->lane_count); if (DISPLAY_VER(dev_priv) >= 20) buf_ctl |= XE2LPD_DDI_BUF_D2D_LINK_ENABLE; @@ -4153,13 +4152,13 @@ static void intel_ddi_read_func_ctl(struct intel_encoder *encoder, } else if (ddi_mode == TRANS_DDI_MODE_SELECT_DP_MST) { intel_ddi_read_func_ctl_dp_mst(encoder, pipe_config, ddi_func_ctl); } else if (ddi_mode == TRANS_DDI_MODE_SELECT_FDI_OR_128B132B && HAS_DP20(display)) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + struct intel_digital_port *dig_port = enc_to_dig_port(encoder); /* * If this is true, we know we're being called from mst stream * encoder's ->get_config(). */ - if (intel_dp->is_mst) + if (intel_dp_mst_encoder_active_links(dig_port)) intel_ddi_read_func_ctl_dp_mst(encoder, pipe_config, ddi_func_ctl); else intel_ddi_read_func_ctl_dp_sst(encoder, pipe_config, ddi_func_ctl); diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 4271da219b410..41128469f12a2 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -6628,12 +6628,30 @@ static int intel_async_flip_check_hw(struct intel_atomic_state *state, struct in static int intel_joiner_add_affected_crtcs(struct intel_atomic_state *state) { struct drm_i915_private *i915 = to_i915(state->base.dev); + const struct intel_plane_state *plane_state; struct intel_crtc_state *crtc_state; + struct intel_plane *plane; struct intel_crtc *crtc; u8 affected_pipes = 0; u8 modeset_pipes = 0; int i; + /* + * Any plane which is in use by the joiner needs its crtc. + * Pull those in first as this will not have happened yet + * if the plane remains disabled according to uapi. + */ + for_each_new_intel_plane_in_state(state, plane, plane_state, i) { + crtc = to_intel_crtc(plane_state->hw.crtc); + if (!crtc) + continue; + + crtc_state = intel_atomic_get_crtc_state(&state->base, crtc); + if (IS_ERR(crtc_state)) + return PTR_ERR(crtc_state); + } + + /* Now pull in all joined crtcs */ for_each_new_intel_crtc_in_state(state, crtc, crtc_state, i) { affected_pipes |= crtc_state->joiner_pipes; if (intel_crtc_needs_modeset(crtc_state)) diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c index 8b1977cfec503..6696a32cdd3e6 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c +++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c @@ -1563,7 +1563,7 @@ intel_dp_128b132b_link_train(struct intel_dp *intel_dp, if (wait_for(intel_dp_128b132b_intra_hop(intel_dp, crtc_state) == 0, 500)) { lt_err(intel_dp, DP_PHY_DPRX, "128b/132b intra-hop not clear\n"); - return false; + goto out; } if (intel_dp_128b132b_lane_eq(intel_dp, crtc_state) && @@ -1575,6 +1575,19 @@ intel_dp_128b132b_link_train(struct intel_dp *intel_dp, passed ? "passed" : "failed", crtc_state->port_clock, crtc_state->lane_count); +out: + /* + * Ensure that the training pattern does get set to TPS2 even in case + * of a failure, as is the case at the end of a passing link training + * and what is expected by the transcoder. Leaving TPS1 set (and + * disabling the link train mode in DP_TP_CTL later from TPS1 directly) + * would result in a stuck transcoder HW state and flip-done timeouts + * later in the modeset sequence. + */ + if (!passed) + intel_dp_program_link_training_pattern(intel_dp, crtc_state, + DP_PHY_DPRX, DP_TRAINING_PATTERN_2); + return passed; } diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index a65cf97ad12df..86d6185fda50a 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -1867,7 +1867,8 @@ intel_dp_mst_encoder_init(struct intel_digital_port *dig_port, int conn_base_id) /* create encoders */ mst_stream_encoders_create(dig_port); ret = drm_dp_mst_topology_mgr_init(&intel_dp->mst_mgr, display->drm, - &intel_dp->aux, 16, 3, conn_base_id); + &intel_dp->aux, 16, + INTEL_NUM_PIPES(display), conn_base_id); if (ret) { intel_dp->mst_mgr.cbs = NULL; return ret; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index cc05bd9e43b49..3fce5c0001444 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -3449,10 +3449,10 @@ static inline int guc_lrc_desc_unpin(struct intel_context *ce) */ ret = deregister_context(ce, ce->guc_id.id); if (ret) { - spin_lock(&ce->guc_state.lock); + spin_lock_irqsave(&ce->guc_state.lock, flags); set_context_registered(ce); clr_context_destroyed(ce); - spin_unlock(&ce->guc_state.lock); + spin_unlock_irqrestore(&ce->guc_state.lock, flags); /* * As gt-pm is awake at function entry, intel_wakeref_put_async merely decrements * the wakeref immediately but per function spec usage call this after unlock. diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c index b3cbf85c00cbd..00d00c480cc58 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c @@ -231,8 +231,8 @@ static void delayed_huc_load_init(struct intel_huc *huc) sw_fence_dummy_notify); i915_sw_fence_commit(&huc->delayed_load.fence); - hrtimer_init(&huc->delayed_load.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - huc->delayed_load.timer.function = huc_delayed_load_timer_callback; + hrtimer_setup(&huc->delayed_load.timer, huc_delayed_load_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } static void delayed_huc_load_fini(struct intel_huc *huc) diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c index 95570cabdf276..f668cd9487f1e 100644 --- a/drivers/gpu/drm/i915/gvt/display.c +++ b/drivers/gpu/drm/i915/gvt/display.c @@ -581,8 +581,7 @@ static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num, vgpu->display.port_num = port_num; /* Init hrtimer based on default refresh rate */ - hrtimer_init(&vblank_timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - vblank_timer->timer.function = vblank_timer_fn; + hrtimer_setup(&vblank_timer->timer, vblank_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); vblank_timer->vrefresh_k = port->vrefresh_k; vblank_timer->period = DIV64_U64_ROUND_CLOSEST(NSEC_PER_SEC * MSEC_PER_SEC, vblank_timer->vrefresh_k); diff --git a/drivers/gpu/drm/i915/gvt/sched_policy.c b/drivers/gpu/drm/i915/gvt/sched_policy.c index c077fb4674f0f..9f97f743aa71a 100644 --- a/drivers/gpu/drm/i915/gvt/sched_policy.c +++ b/drivers/gpu/drm/i915/gvt/sched_policy.c @@ -286,8 +286,7 @@ static int tbs_sched_init(struct intel_gvt *gvt) return -ENOMEM; INIT_LIST_HEAD(&data->lru_runq_head); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - data->timer.function = tbs_timer_fn; + hrtimer_setup(&data->timer, tbs_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); data->period = GVT_DEFAULT_TIME_SLICE; data->gvt = gvt; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 5384d1bb49233..279e266b4b06c 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -3359,9 +3359,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, "opening stream oa config uuid=%s\n", stream->oa_config->uuid); - hrtimer_init(&stream->poll_check_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - stream->poll_check_timer.function = oa_poll_check_timer_cb; + hrtimer_setup(&stream->poll_check_timer, oa_poll_check_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); init_waitqueue_head(&stream->poll_wq); spin_lock_init(&stream->oa_buffer.ptr_lock); mutex_init(&stream->lock); diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index e55db036be1bb..0ce87f188d116 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -1264,8 +1264,7 @@ void i915_pmu_register(struct drm_i915_private *i915) int ret = -ENOMEM; spin_lock_init(&pmu->lock); - hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pmu->timer.function = i915_sample; + hrtimer_setup(&pmu->timer, i915_sample, CLOCK_MONOTONIC, HRTIMER_MODE_REL); pmu->cpuhp.cpu = -1; init_rc6(pmu); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 765e6c0528fb0..786c727aea454 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3633,7 +3633,7 @@ enum skl_power_gate { #define DDI_BUF_IS_IDLE (1 << 7) #define DDI_BUF_CTL_TC_PHY_OWNERSHIP REG_BIT(6) #define DDI_A_4_LANES (1 << 4) -#define DDI_PORT_WIDTH(width) (((width) - 1) << 1) +#define DDI_PORT_WIDTH(width) (((width) == 3 ? 4 : ((width) - 1)) << 1) #define DDI_PORT_WIDTH_MASK (7 << 1) #define DDI_PORT_WIDTH_SHIFT 1 #define DDI_INIT_DISPLAY_DETECTED (1 << 0) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 8f62cfa23fb76..ea0b8e7e4828a 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -293,8 +293,7 @@ static void __rq_init_watchdog(struct i915_request *rq) { struct i915_request_watchdog *wdg = &rq->watchdog; - hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wdg->timer.function = __rq_watchdog_expired; + hrtimer_setup(&wdg->timer, __rq_watchdog_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static void __rq_arm_watchdog(struct i915_request *rq) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index eed4937c3ff30..bdcfcae83b52b 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -2103,8 +2103,7 @@ static int __fw_domain_init(struct intel_uncore *uncore, d->mask = BIT(domain_id); - hrtimer_init(&d->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - d->timer.function = intel_uncore_fw_release_timer; + hrtimer_setup(&d->timer, intel_uncore_fw_release_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); uncore->fw_domains |= BIT(domain_id); diff --git a/drivers/gpu/drm/imagination/Makefile b/drivers/gpu/drm/imagination/Makefile index 9bc6a3884c223..3d9d4d40fb806 100644 --- a/drivers/gpu/drm/imagination/Makefile +++ b/drivers/gpu/drm/imagination/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only OR MIT # Copyright (c) 2023 Imagination Technologies Ltd. -subdir-ccflags-y := -I$(src) - powervr-y := \ pvr_ccb.o \ pvr_cccb.o \ diff --git a/drivers/gpu/drm/imagination/pvr_fw_meta.c b/drivers/gpu/drm/imagination/pvr_fw_meta.c index c39beb70c3173..6d13864851fc2 100644 --- a/drivers/gpu/drm/imagination/pvr_fw_meta.c +++ b/drivers/gpu/drm/imagination/pvr_fw_meta.c @@ -527,8 +527,10 @@ pvr_meta_vm_map(struct pvr_device *pvr_dev, struct pvr_fw_object *fw_obj) static void pvr_meta_vm_unmap(struct pvr_device *pvr_dev, struct pvr_fw_object *fw_obj) { - pvr_vm_unmap(pvr_dev->kernel_vm_ctx, fw_obj->fw_mm_node.start, - fw_obj->fw_mm_node.size); + struct pvr_gem_object *pvr_obj = fw_obj->gem; + + pvr_vm_unmap_obj(pvr_dev->kernel_vm_ctx, pvr_obj, + fw_obj->fw_mm_node.start, fw_obj->fw_mm_node.size); } static bool diff --git a/drivers/gpu/drm/imagination/pvr_fw_trace.c b/drivers/gpu/drm/imagination/pvr_fw_trace.c index 73707daa4e52d..5dbb636d7d4ff 100644 --- a/drivers/gpu/drm/imagination/pvr_fw_trace.c +++ b/drivers/gpu/drm/imagination/pvr_fw_trace.c @@ -333,8 +333,8 @@ static int fw_trace_seq_show(struct seq_file *s, void *v) if (sf_id == ROGUE_FW_SF_LAST) return -EINVAL; - timestamp = read_fw_trace(trace_seq_data, 1) | - ((u64)read_fw_trace(trace_seq_data, 2) << 32); + timestamp = ((u64)read_fw_trace(trace_seq_data, 1) << 32) | + read_fw_trace(trace_seq_data, 2); timestamp = (timestamp & ~ROGUE_FWT_TIMESTAMP_TIME_CLRMSK) >> ROGUE_FWT_TIMESTAMP_TIME_SHIFT; diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c index c4f08432882b1..43411be930a21 100644 --- a/drivers/gpu/drm/imagination/pvr_queue.c +++ b/drivers/gpu/drm/imagination/pvr_queue.c @@ -109,12 +109,20 @@ pvr_queue_fence_get_driver_name(struct dma_fence *f) return PVR_DRIVER_NAME; } +static void pvr_queue_fence_release_work(struct work_struct *w) +{ + struct pvr_queue_fence *fence = container_of(w, struct pvr_queue_fence, release_work); + + pvr_context_put(fence->queue->ctx); + dma_fence_free(&fence->base); +} + static void pvr_queue_fence_release(struct dma_fence *f) { struct pvr_queue_fence *fence = container_of(f, struct pvr_queue_fence, base); + struct pvr_device *pvr_dev = fence->queue->ctx->pvr_dev; - pvr_context_put(fence->queue->ctx); - dma_fence_free(f); + queue_work(pvr_dev->sched_wq, &fence->release_work); } static const char * @@ -268,6 +276,7 @@ pvr_queue_fence_init(struct dma_fence *f, pvr_context_get(queue->ctx); fence->queue = queue; + INIT_WORK(&fence->release_work, pvr_queue_fence_release_work); dma_fence_init(&fence->base, fence_ops, &fence_ctx->lock, fence_ctx->id, atomic_inc_return(&fence_ctx->seqno)); @@ -304,8 +313,9 @@ pvr_queue_cccb_fence_init(struct dma_fence *fence, struct pvr_queue *queue) static void pvr_queue_job_fence_init(struct dma_fence *fence, struct pvr_queue *queue) { - pvr_queue_fence_init(fence, queue, &pvr_queue_job_fence_ops, - &queue->job_fence_ctx); + if (!fence->ops) + pvr_queue_fence_init(fence, queue, &pvr_queue_job_fence_ops, + &queue->job_fence_ctx); } /** diff --git a/drivers/gpu/drm/imagination/pvr_queue.h b/drivers/gpu/drm/imagination/pvr_queue.h index e06ced69302fc..93fe9ac9f58cc 100644 --- a/drivers/gpu/drm/imagination/pvr_queue.h +++ b/drivers/gpu/drm/imagination/pvr_queue.h @@ -5,6 +5,7 @@ #define PVR_QUEUE_H #include +#include #include "pvr_cccb.h" #include "pvr_device.h" @@ -63,6 +64,9 @@ struct pvr_queue_fence { /** @queue: Queue that created this fence. */ struct pvr_queue *queue; + + /** @release_work: Fence release work structure. */ + struct work_struct release_work; }; /** diff --git a/drivers/gpu/drm/imagination/pvr_vm.c b/drivers/gpu/drm/imagination/pvr_vm.c index 363f885a70982..2896fa7501b1c 100644 --- a/drivers/gpu/drm/imagination/pvr_vm.c +++ b/drivers/gpu/drm/imagination/pvr_vm.c @@ -293,8 +293,9 @@ pvr_vm_bind_op_map_init(struct pvr_vm_bind_op *bind_op, static int pvr_vm_bind_op_unmap_init(struct pvr_vm_bind_op *bind_op, - struct pvr_vm_context *vm_ctx, u64 device_addr, - u64 size) + struct pvr_vm_context *vm_ctx, + struct pvr_gem_object *pvr_obj, + u64 device_addr, u64 size) { int err; @@ -318,6 +319,7 @@ pvr_vm_bind_op_unmap_init(struct pvr_vm_bind_op *bind_op, goto err_bind_op_fini; } + bind_op->pvr_obj = pvr_obj; bind_op->vm_ctx = vm_ctx; bind_op->device_addr = device_addr; bind_op->size = size; @@ -597,20 +599,6 @@ pvr_vm_create_context(struct pvr_device *pvr_dev, bool is_userspace_context) return ERR_PTR(err); } -/** - * pvr_vm_unmap_all() - Unmap all mappings associated with a VM context. - * @vm_ctx: Target VM context. - * - * This function ensures that no mappings are left dangling by unmapping them - * all in order of ascending device-virtual address. - */ -void -pvr_vm_unmap_all(struct pvr_vm_context *vm_ctx) -{ - WARN_ON(pvr_vm_unmap(vm_ctx, vm_ctx->gpuvm_mgr.mm_start, - vm_ctx->gpuvm_mgr.mm_range)); -} - /** * pvr_vm_context_release() - Teardown a VM context. * @ref_count: Pointer to reference counter of the VM context. @@ -703,11 +691,7 @@ pvr_vm_lock_extra(struct drm_gpuvm_exec *vm_exec) struct pvr_vm_bind_op *bind_op = vm_exec->extra.priv; struct pvr_gem_object *pvr_obj = bind_op->pvr_obj; - /* Unmap operations don't have an object to lock. */ - if (!pvr_obj) - return 0; - - /* Acquire lock on the GEM being mapped. */ + /* Acquire lock on the GEM object being mapped/unmapped. */ return drm_exec_lock_obj(&vm_exec->exec, gem_from_pvr_gem(pvr_obj)); } @@ -772,8 +756,10 @@ pvr_vm_map(struct pvr_vm_context *vm_ctx, struct pvr_gem_object *pvr_obj, } /** - * pvr_vm_unmap() - Unmap an already mapped section of device-virtual memory. + * pvr_vm_unmap_obj_locked() - Unmap an already mapped section of device-virtual + * memory. * @vm_ctx: Target VM context. + * @pvr_obj: Target PowerVR memory object. * @device_addr: Virtual device address at the start of the target mapping. * @size: Size of the target mapping. * @@ -784,9 +770,13 @@ pvr_vm_map(struct pvr_vm_context *vm_ctx, struct pvr_gem_object *pvr_obj, * * Any error encountered while performing internal operations required to * destroy the mapping (returned from pvr_vm_gpuva_unmap or * pvr_vm_gpuva_remap). + * + * The vm_ctx->lock must be held when calling this function. */ -int -pvr_vm_unmap(struct pvr_vm_context *vm_ctx, u64 device_addr, u64 size) +static int +pvr_vm_unmap_obj_locked(struct pvr_vm_context *vm_ctx, + struct pvr_gem_object *pvr_obj, + u64 device_addr, u64 size) { struct pvr_vm_bind_op bind_op = {0}; struct drm_gpuvm_exec vm_exec = { @@ -799,11 +789,13 @@ pvr_vm_unmap(struct pvr_vm_context *vm_ctx, u64 device_addr, u64 size) }, }; - int err = pvr_vm_bind_op_unmap_init(&bind_op, vm_ctx, device_addr, - size); + int err = pvr_vm_bind_op_unmap_init(&bind_op, vm_ctx, pvr_obj, + device_addr, size); if (err) return err; + pvr_gem_object_get(pvr_obj); + err = drm_gpuvm_exec_lock(&vm_exec); if (err) goto err_cleanup; @@ -818,6 +810,96 @@ pvr_vm_unmap(struct pvr_vm_context *vm_ctx, u64 device_addr, u64 size) return err; } +/** + * pvr_vm_unmap_obj() - Unmap an already mapped section of device-virtual + * memory. + * @vm_ctx: Target VM context. + * @pvr_obj: Target PowerVR memory object. + * @device_addr: Virtual device address at the start of the target mapping. + * @size: Size of the target mapping. + * + * Return: + * * 0 on success, + * * Any error encountered by pvr_vm_unmap_obj_locked. + */ +int +pvr_vm_unmap_obj(struct pvr_vm_context *vm_ctx, struct pvr_gem_object *pvr_obj, + u64 device_addr, u64 size) +{ + int err; + + mutex_lock(&vm_ctx->lock); + err = pvr_vm_unmap_obj_locked(vm_ctx, pvr_obj, device_addr, size); + mutex_unlock(&vm_ctx->lock); + + return err; +} + +/** + * pvr_vm_unmap() - Unmap an already mapped section of device-virtual memory. + * @vm_ctx: Target VM context. + * @device_addr: Virtual device address at the start of the target mapping. + * @size: Size of the target mapping. + * + * Return: + * * 0 on success, + * * Any error encountered by drm_gpuva_find, + * * Any error encountered by pvr_vm_unmap_obj_locked. + */ +int +pvr_vm_unmap(struct pvr_vm_context *vm_ctx, u64 device_addr, u64 size) +{ + struct pvr_gem_object *pvr_obj; + struct drm_gpuva *va; + int err; + + mutex_lock(&vm_ctx->lock); + + va = drm_gpuva_find(&vm_ctx->gpuvm_mgr, device_addr, size); + if (va) { + pvr_obj = gem_to_pvr_gem(va->gem.obj); + err = pvr_vm_unmap_obj_locked(vm_ctx, pvr_obj, + va->va.addr, va->va.range); + } else { + err = -ENOENT; + } + + mutex_unlock(&vm_ctx->lock); + + return err; +} + +/** + * pvr_vm_unmap_all() - Unmap all mappings associated with a VM context. + * @vm_ctx: Target VM context. + * + * This function ensures that no mappings are left dangling by unmapping them + * all in order of ascending device-virtual address. + */ +void +pvr_vm_unmap_all(struct pvr_vm_context *vm_ctx) +{ + mutex_lock(&vm_ctx->lock); + + for (;;) { + struct pvr_gem_object *pvr_obj; + struct drm_gpuva *va; + + va = drm_gpuva_find_first(&vm_ctx->gpuvm_mgr, + vm_ctx->gpuvm_mgr.mm_start, + vm_ctx->gpuvm_mgr.mm_range); + if (!va) + break; + + pvr_obj = gem_to_pvr_gem(va->gem.obj); + + WARN_ON(pvr_vm_unmap_obj_locked(vm_ctx, pvr_obj, + va->va.addr, va->va.range)); + } + + mutex_unlock(&vm_ctx->lock); +} + /* Static data areas are determined by firmware. */ static const struct drm_pvr_static_data_area static_data_areas[] = { { diff --git a/drivers/gpu/drm/imagination/pvr_vm.h b/drivers/gpu/drm/imagination/pvr_vm.h index 79406243617c1..b0528dffa7f1b 100644 --- a/drivers/gpu/drm/imagination/pvr_vm.h +++ b/drivers/gpu/drm/imagination/pvr_vm.h @@ -38,6 +38,9 @@ struct pvr_vm_context *pvr_vm_create_context(struct pvr_device *pvr_dev, int pvr_vm_map(struct pvr_vm_context *vm_ctx, struct pvr_gem_object *pvr_obj, u64 pvr_obj_offset, u64 device_addr, u64 size); +int pvr_vm_unmap_obj(struct pvr_vm_context *vm_ctx, + struct pvr_gem_object *pvr_obj, + u64 device_addr, u64 size); int pvr_vm_unmap(struct pvr_vm_context *vm_ctx, u64 device_addr, u64 size); void pvr_vm_unmap_all(struct pvr_vm_context *vm_ctx); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 65d38b25c0707..699b0dd34b18f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -813,10 +813,10 @@ static int a6xx_gmu_fw_load(struct a6xx_gmu *gmu) } ver = gmu_read(gmu, REG_A6XX_GMU_CORE_FW_VERSION); - DRM_INFO("Loaded GMU firmware v%u.%u.%u\n", - FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MAJOR__MASK, ver), - FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MINOR__MASK, ver), - FIELD_GET(A6XX_GMU_CORE_FW_VERSION_STEP__MASK, ver)); + DRM_INFO_ONCE("Loaded GMU firmware v%u.%u.%u\n", + FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MAJOR__MASK, ver), + FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MINOR__MASK, ver), + FIELD_GET(A6XX_GMU_CORE_FW_VERSION_STEP__MASK, ver)); return 0; } diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h index 421afacb72480..36cc9dbc00b5c 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h @@ -297,7 +297,7 @@ static const struct dpu_wb_cfg sm8150_wb[] = { { .name = "wb_2", .id = WB_2, .base = 0x65000, .len = 0x2c8, - .features = WB_SDM845_MASK, + .features = WB_SM8250_MASK, .format_list = wb2_formats_rgb, .num_formats = ARRAY_SIZE(wb2_formats_rgb), .clk_ctrl = DPU_CLK_CTRL_WB2, diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h index 641023b102bf5..e8eacdb47967a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h @@ -304,7 +304,7 @@ static const struct dpu_wb_cfg sc8180x_wb[] = { { .name = "wb_2", .id = WB_2, .base = 0x65000, .len = 0x2c8, - .features = WB_SDM845_MASK, + .features = WB_SM8250_MASK, .format_list = wb2_formats_rgb, .num_formats = ARRAY_SIZE(wb2_formats_rgb), .clk_ctrl = DPU_CLK_CTRL_WB2, diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h index 621a2140f675f..d761ed705bac3 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h @@ -116,14 +116,12 @@ static const struct dpu_lm_cfg sm6150_lm[] = { .sblk = &sdm845_lm_sblk, .pingpong = PINGPONG_0, .dspp = DSPP_0, - .lm_pair = LM_1, }, { .name = "lm_1", .id = LM_1, .base = 0x45000, .len = 0x320, .features = MIXER_QCM2290_MASK, .sblk = &sdm845_lm_sblk, .pingpong = PINGPONG_1, - .lm_pair = LM_0, }, { .name = "lm_2", .id = LM_2, .base = 0x46000, .len = 0x320, diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h index d039b96beb97c..76f60a2df7a89 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h @@ -144,7 +144,7 @@ static const struct dpu_wb_cfg sm6125_wb[] = { { .name = "wb_2", .id = WB_2, .base = 0x65000, .len = 0x2c8, - .features = WB_SDM845_MASK, + .features = WB_SM8250_MASK, .format_list = wb2_formats_rgb, .num_formats = ARRAY_SIZE(wb2_formats_rgb), .clk_ctrl = DPU_CLK_CTRL_WB2, diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c index 7191b1a6d41b3..e5dcd41a361f4 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c @@ -1228,8 +1228,6 @@ static int dpu_crtc_reassign_planes(struct drm_crtc *crtc, struct drm_crtc_state done: kfree(states); return ret; - - return 0; } static int dpu_crtc_atomic_check(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index 5172ab4dea995..48e6e8d74c855 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -2281,6 +2281,9 @@ void dpu_encoder_helper_phys_cleanup(struct dpu_encoder_phys *phys_enc) } } + if (phys_enc->hw_pp && phys_enc->hw_pp->ops.setup_dither) + phys_enc->hw_pp->ops.setup_dither(phys_enc->hw_pp, NULL); + /* reset the merge 3D HW block */ if (phys_enc->hw_pp && phys_enc->hw_pp->merge_3d) { phys_enc->hw_pp->merge_3d->ops.setup_3d_mode(phys_enc->hw_pp->merge_3d, diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c index 657200401f576..cec6d4e8baec4 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c @@ -52,6 +52,7 @@ static void dpu_hw_dsc_config(struct dpu_hw_dsc *hw_dsc, u32 slice_last_group_size; u32 det_thresh_flatness; bool is_cmd_mode = !(mode & DSC_MODE_VIDEO); + bool input_10_bits = dsc->bits_per_component == 10; DPU_REG_WRITE(c, DSC_COMMON_MODE, mode); @@ -68,7 +69,7 @@ static void dpu_hw_dsc_config(struct dpu_hw_dsc *hw_dsc, data |= (dsc->line_buf_depth << 3); data |= (dsc->simple_422 << 2); data |= (dsc->convert_rgb << 1); - data |= dsc->bits_per_component; + data |= input_10_bits; DPU_REG_WRITE(c, DSC_ENC, data); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c index ad19330de61ab..562a3f4c5238a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c @@ -272,7 +272,7 @@ static void _setup_mdp_ops(struct dpu_hw_mdp_ops *ops, if (cap & BIT(DPU_MDP_VSYNC_SEL)) ops->setup_vsync_source = dpu_hw_setup_vsync_sel; - else + else if (!(cap & BIT(DPU_MDP_PERIPH_0_REMOVED))) ops->setup_vsync_source = dpu_hw_setup_wd_timer; ops->get_safe_status = dpu_hw_get_safe_status; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index 098abc2c0003c..af3e541f60c30 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -1164,7 +1164,6 @@ int dpu_assign_plane_resources(struct dpu_global_state *global_state, unsigned int num_planes) { unsigned int i; - int ret; for (i = 0; i < num_planes; i++) { struct drm_plane_state *plane_state = states[i]; @@ -1173,13 +1172,13 @@ int dpu_assign_plane_resources(struct dpu_global_state *global_state, !plane_state->visible) continue; - ret = dpu_plane_virtual_assign_resources(crtc, global_state, + int ret = dpu_plane_virtual_assign_resources(crtc, global_state, state, plane_state); if (ret) - break; + return ret; } - return ret; + return 0; } static void dpu_plane_flush_csc(struct dpu_plane *pdpu, struct dpu_sw_pipe *pipe) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 24dd37f1682bf..3898850739abb 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -930,16 +930,17 @@ enum drm_mode_status msm_dp_bridge_mode_valid(struct drm_bridge *bridge, return -EINVAL; } - if (mode->clock > DP_MAX_PIXEL_CLK_KHZ) - return MODE_CLOCK_HIGH; - msm_dp_display = container_of(dp, struct msm_dp_display_private, msm_dp_display); link_info = &msm_dp_display->panel->link_info; - if (drm_mode_is_420_only(&dp->connector->display_info, mode) && - msm_dp_display->panel->vsc_sdp_supported) + if ((drm_mode_is_420_only(&dp->connector->display_info, mode) && + msm_dp_display->panel->vsc_sdp_supported) || + msm_dp_wide_bus_available(dp)) mode_pclk_khz /= 2; + if (mode_pclk_khz > DP_MAX_PIXEL_CLK_KHZ) + return MODE_CLOCK_HIGH; + mode_bpp = dp->connector->display_info.bpc * num_components; if (!mode_bpp) mode_bpp = default_bpp; diff --git a/drivers/gpu/drm/msm/dp/dp_drm.c b/drivers/gpu/drm/msm/dp/dp_drm.c index d3e241ea69416..16b7913d1eefa 100644 --- a/drivers/gpu/drm/msm/dp/dp_drm.c +++ b/drivers/gpu/drm/msm/dp/dp_drm.c @@ -257,7 +257,10 @@ static enum drm_mode_status msm_edp_bridge_mode_valid(struct drm_bridge *bridge, return -EINVAL; } - if (mode->clock > DP_MAX_PIXEL_CLK_KHZ) + if (msm_dp_wide_bus_available(dp)) + mode_pclk_khz /= 2; + + if (mode_pclk_khz > DP_MAX_PIXEL_CLK_KHZ) return MODE_CLOCK_HIGH; /* diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c index 031446c87daec..798168180c1ab 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c @@ -83,6 +83,9 @@ struct dsi_pll_7nm { /* protects REG_DSI_7nm_PHY_CMN_CLK_CFG0 register */ spinlock_t postdiv_lock; + /* protects REG_DSI_7nm_PHY_CMN_CLK_CFG1 register */ + spinlock_t pclk_mux_lock; + struct pll_7nm_cached_state cached_state; struct dsi_pll_7nm *slave; @@ -372,22 +375,41 @@ static void dsi_pll_enable_pll_bias(struct dsi_pll_7nm *pll) ndelay(250); } -static void dsi_pll_disable_global_clk(struct dsi_pll_7nm *pll) +static void dsi_pll_cmn_clk_cfg0_write(struct dsi_pll_7nm *pll, u32 val) { + unsigned long flags; + + spin_lock_irqsave(&pll->postdiv_lock, flags); + writel(val, pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG0); + spin_unlock_irqrestore(&pll->postdiv_lock, flags); +} + +static void dsi_pll_cmn_clk_cfg1_update(struct dsi_pll_7nm *pll, u32 mask, + u32 val) +{ + unsigned long flags; u32 data; + spin_lock_irqsave(&pll->pclk_mux_lock, flags); data = readl(pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); - writel(data & ~BIT(5), pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + data &= ~mask; + data |= val & mask; + + writel(data, pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + spin_unlock_irqrestore(&pll->pclk_mux_lock, flags); +} + +static void dsi_pll_disable_global_clk(struct dsi_pll_7nm *pll) +{ + dsi_pll_cmn_clk_cfg1_update(pll, DSI_7nm_PHY_CMN_CLK_CFG1_CLK_EN, 0); } static void dsi_pll_enable_global_clk(struct dsi_pll_7nm *pll) { - u32 data; + u32 cfg_1 = DSI_7nm_PHY_CMN_CLK_CFG1_CLK_EN | DSI_7nm_PHY_CMN_CLK_CFG1_CLK_EN_SEL; writel(0x04, pll->phy->base + REG_DSI_7nm_PHY_CMN_CTRL_3); - - data = readl(pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); - writel(data | BIT(5) | BIT(4), pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + dsi_pll_cmn_clk_cfg1_update(pll, cfg_1, cfg_1); } static void dsi_pll_phy_dig_reset(struct dsi_pll_7nm *pll) @@ -565,7 +587,6 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) { struct dsi_pll_7nm *pll_7nm = to_pll_7nm(phy->vco_hw); struct pll_7nm_cached_state *cached = &pll_7nm->cached_state; - void __iomem *phy_base = pll_7nm->phy->base; u32 val; int ret; @@ -574,13 +595,10 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) val |= cached->pll_out_div; writel(val, pll_7nm->phy->pll_base + REG_DSI_7nm_PHY_PLL_PLL_OUTDIV_RATE); - writel(cached->bit_clk_div | (cached->pix_clk_div << 4), - phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG0); - - val = readl(phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); - val &= ~0x3; - val |= cached->pll_mux; - writel(val, phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + dsi_pll_cmn_clk_cfg0_write(pll_7nm, + DSI_7nm_PHY_CMN_CLK_CFG0_DIV_CTRL_3_0(cached->bit_clk_div) | + DSI_7nm_PHY_CMN_CLK_CFG0_DIV_CTRL_7_4(cached->pix_clk_div)); + dsi_pll_cmn_clk_cfg1_update(pll_7nm, 0x3, cached->pll_mux); ret = dsi_pll_7nm_vco_set_rate(phy->vco_hw, pll_7nm->vco_current_rate, @@ -599,7 +617,6 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) static int dsi_7nm_set_usecase(struct msm_dsi_phy *phy) { struct dsi_pll_7nm *pll_7nm = to_pll_7nm(phy->vco_hw); - void __iomem *base = phy->base; u32 data = 0x0; /* internal PLL */ DBG("DSI PLL%d", pll_7nm->phy->id); @@ -618,7 +635,8 @@ static int dsi_7nm_set_usecase(struct msm_dsi_phy *phy) } /* set PLL src */ - writel(data << 2, base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + dsi_pll_cmn_clk_cfg1_update(pll_7nm, DSI_7nm_PHY_CMN_CLK_CFG1_BITCLK_SEL__MASK, + DSI_7nm_PHY_CMN_CLK_CFG1_BITCLK_SEL(data)); return 0; } @@ -733,7 +751,7 @@ static int pll_7nm_register(struct dsi_pll_7nm *pll_7nm, struct clk_hw **provide pll_by_2_bit, }), 2, 0, pll_7nm->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1, - 0, 1, 0, NULL); + 0, 1, 0, &pll_7nm->pclk_mux_lock); if (IS_ERR(hw)) { ret = PTR_ERR(hw); goto fail; @@ -778,6 +796,7 @@ static int dsi_pll_7nm_init(struct msm_dsi_phy *phy) pll_7nm_list[phy->id] = pll_7nm; spin_lock_init(&pll_7nm->postdiv_lock); + spin_lock_init(&pll_7nm->pclk_mux_lock); pll_7nm->phy = phy; diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index fee31680a6d54..a650778552017 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -537,15 +537,12 @@ static inline int align_pitch(int width, int bpp) static inline unsigned long timeout_to_jiffies(const ktime_t *timeout) { ktime_t now = ktime_get(); - s64 remaining_jiffies; - if (ktime_compare(*timeout, now) < 0) { - remaining_jiffies = 0; - } else { - ktime_t rem = ktime_sub(*timeout, now); - remaining_jiffies = ktime_divns(rem, NSEC_PER_SEC / HZ); - } + if (ktime_compare(*timeout, now) <= 0) + return 0; + ktime_t rem = ktime_sub(*timeout, now); + s64 remaining_jiffies = ktime_divns(rem, NSEC_PER_SEC / HZ); return clamp(remaining_jiffies, 1LL, (s64)INT_MAX); } diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index 1a5d4f1c8b422..d41e5a6bbee09 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -65,8 +65,7 @@ msm_fence_context_alloc(struct drm_device *dev, volatile uint32_t *fenceptr, fctx->completed_fence = fctx->last_fence; *fctx->fenceptr = fctx->last_fence; - hrtimer_init(&fctx->deadline_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - fctx->deadline_timer.function = deadline_timer; + hrtimer_setup(&fctx->deadline_timer, deadline_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); kthread_init_work(&fctx->deadline_work, deadline_work); diff --git a/drivers/gpu/drm/msm/msm_io_utils.c b/drivers/gpu/drm/msm/msm_io_utils.c index afedd61c3e288..a6efe1eac2712 100644 --- a/drivers/gpu/drm/msm/msm_io_utils.c +++ b/drivers/gpu/drm/msm/msm_io_utils.c @@ -135,8 +135,7 @@ void msm_hrtimer_work_init(struct msm_hrtimer_work *work, clockid_t clock_id, enum hrtimer_mode mode) { - hrtimer_init(&work->timer, clock_id, mode); - work->timer.function = msm_hrtimer_worktimer; + hrtimer_setup(&work->timer, msm_hrtimer_worktimer, clock_id, mode); work->worker = worker; kthread_init_work(&work->work, fn); } diff --git a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml index d54b72f924493..35f7f40e405b7 100644 --- a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml +++ b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml @@ -9,8 +9,15 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd"> - - + + + + + + + + + diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index ce840300578d8..1050a4617fc15 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -4,6 +4,7 @@ config DRM_NOUVEAU depends on DRM && PCI && MMU select IOMMU_API select FW_LOADER + select FW_CACHE if PM_SLEEP select DRM_CLIENT_SELECTION select DRM_DISPLAY_DP_HELPER select DRM_DISPLAY_HDMI_HELPER diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 8d5c9c74cbb90..eac0d1d2dbda2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -775,7 +775,6 @@ nouveau_connector_force(struct drm_connector *connector) if (!nv_encoder) { NV_ERROR(drm, "can't find encoder to force %s on!\n", connector->name); - connector->status = connector_status_disconnected; return; } diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index b4da82ddbb6b2..8ea98f06d39af 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -590,6 +590,7 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm, unsigned long timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); struct mm_struct *mm = svmm->notifier.mm; + struct folio *folio; struct page *page; unsigned long start = args->p.addr; unsigned long notifier_seq; @@ -616,12 +617,16 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm, ret = -EINVAL; goto out; } + folio = page_folio(page); mutex_lock(&svmm->mutex); if (!mmu_interval_read_retry(¬ifier->notifier, notifier_seq)) break; mutex_unlock(&svmm->mutex); + + folio_unlock(folio); + folio_put(folio); } /* Map the page on the GPU. */ @@ -637,8 +642,8 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm, ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL); mutex_unlock(&svmm->mutex); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: mmu_interval_notifier_remove(¬ifier->notifier); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c index a6f410ba60bc9..d393bc540f862 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c @@ -75,7 +75,7 @@ gp10b_pmu_acr = { .bootstrap_multiple_falcons = gp10b_pmu_acr_bootstrap_multiple_falcons, }; -#if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) +#if IS_ENABLED(CONFIG_ARCH_TEGRA_186_SOC) MODULE_FIRMWARE("nvidia/gp10b/pmu/desc.bin"); MODULE_FIRMWARE("nvidia/gp10b/pmu/image.bin"); MODULE_FIRMWARE("nvidia/gp10b/pmu/sig.bin"); diff --git a/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c b/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c index 45d09e6fa667f..7d68a8acfe2ea 100644 --- a/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c +++ b/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c @@ -109,13 +109,13 @@ static int jadard_prepare(struct drm_panel *panel) if (jadard->desc->lp11_to_reset_delay_ms) msleep(jadard->desc->lp11_to_reset_delay_ms); - gpiod_set_value(jadard->reset, 1); + gpiod_set_value(jadard->reset, 0); msleep(5); - gpiod_set_value(jadard->reset, 0); + gpiod_set_value(jadard->reset, 1); msleep(10); - gpiod_set_value(jadard->reset, 1); + gpiod_set_value(jadard->reset, 0); msleep(130); ret = jadard->desc->init(jadard); @@ -1130,7 +1130,7 @@ static int jadard_dsi_probe(struct mipi_dsi_device *dsi) dsi->format = desc->format; dsi->lanes = desc->lanes; - jadard->reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); + jadard->reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_HIGH); if (IS_ERR(jadard->reset)) { DRM_DEV_ERROR(&dsi->dev, "failed to get our reset GPIO\n"); return PTR_ERR(jadard->reset); diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c index 05c13102a8cb8..d22889fbfa9c8 100644 --- a/drivers/gpu/drm/radeon/r300.c +++ b/drivers/gpu/drm/radeon/r300.c @@ -359,7 +359,8 @@ int r300_mc_wait_for_idle(struct radeon_device *rdev) return -1; } -static void r300_gpu_init(struct radeon_device *rdev) +/* rs400_gpu_init also calls this! */ +void r300_gpu_init(struct radeon_device *rdev) { uint32_t gb_tile_config, tmp; diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index 1e00f6b99f94b..8f5e07834fcc6 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -165,6 +165,7 @@ void r200_set_safe_registers(struct radeon_device *rdev); */ extern int r300_init(struct radeon_device *rdev); extern void r300_fini(struct radeon_device *rdev); +extern void r300_gpu_init(struct radeon_device *rdev); extern int r300_suspend(struct radeon_device *rdev); extern int r300_resume(struct radeon_device *rdev); extern int r300_asic_reset(struct radeon_device *rdev, bool hard); diff --git a/drivers/gpu/drm/radeon/rs400.c b/drivers/gpu/drm/radeon/rs400.c index d6c18fd740ec6..13cd0a688a65c 100644 --- a/drivers/gpu/drm/radeon/rs400.c +++ b/drivers/gpu/drm/radeon/rs400.c @@ -256,8 +256,22 @@ int rs400_mc_wait_for_idle(struct radeon_device *rdev) static void rs400_gpu_init(struct radeon_device *rdev) { - /* FIXME: is this correct ? */ - r420_pipes_init(rdev); + /* Earlier code was calling r420_pipes_init and then + * rs400_mc_wait_for_idle(rdev). The problem is that + * at least on my Mobility Radeon Xpress 200M RC410 card + * that ends up in this code path ends up num_gb_pipes == 3 + * while the card seems to have only one pipe. With the + * r420 pipe initialization method. + * + * Problems shown up as HyperZ glitches, see: + * https://bugs.freedesktop.org/show_bug.cgi?id=110897 + * + * Delegating initialization to r300 code seems to work + * and results in proper pipe numbers. The rs400 cards + * are said to be not r400, but r300 kind of cards. + */ + r300_gpu_init(rdev); + if (rs400_mc_wait_for_idle(rdev)) { pr_warn("rs400: Failed to wait MC idle while programming pipes. Bad things might happen. %08x\n", RREG32(RADEON_MC_STATUS)); diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h index c75302ca3427c..f56e77e7f6d02 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h +++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h @@ -21,7 +21,7 @@ * */ -#if !defined(_GPU_SCHED_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#if !defined(_GPU_SCHED_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _GPU_SCHED_TRACE_H_ #include @@ -106,7 +106,7 @@ TRACE_EVENT(drm_sched_job_wait_dep, __entry->seqno) ); -#endif +#endif /* _GPU_SCHED_TRACE_H_ */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/gpu/drm/tiny/bochs.c b/drivers/gpu/drm/tiny/bochs.c index c67e1f9067859..8706763af8fba 100644 --- a/drivers/gpu/drm/tiny/bochs.c +++ b/drivers/gpu/drm/tiny/bochs.c @@ -335,8 +335,6 @@ static void bochs_hw_setmode(struct bochs_device *bochs, struct drm_display_mode bochs->xres, bochs->yres, bochs->bpp, bochs->yres_virtual); - bochs_hw_blank(bochs, false); - bochs_dispi_write(bochs, VBE_DISPI_INDEX_ENABLE, 0); bochs_dispi_write(bochs, VBE_DISPI_INDEX_BPP, bochs->bpp); bochs_dispi_write(bochs, VBE_DISPI_INDEX_XRES, bochs->xres); @@ -506,6 +504,9 @@ static int bochs_crtc_helper_atomic_check(struct drm_crtc *crtc, static void bochs_crtc_helper_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_state *state) { + struct bochs_device *bochs = to_bochs_device(crtc->dev); + + bochs_hw_blank(bochs, false); } static void bochs_crtc_helper_atomic_disable(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c index b20ac17057262..fa269d279e257 100644 --- a/drivers/gpu/drm/vkms/vkms_composer.c +++ b/drivers/gpu/drm/vkms/vkms_composer.c @@ -67,7 +67,7 @@ static u16 lerp_u16(u16 a, u16 b, s64 t) s64 delta = drm_fixp_mul(b_fp - a_fp, t); - return drm_fixp2int(a_fp + delta); + return drm_fixp2int_round(a_fp + delta); } static s64 get_lut_index(const struct vkms_color_lut *lut, u16 channel_value) diff --git a/drivers/gpu/drm/vkms/vkms_crtc.c b/drivers/gpu/drm/vkms/vkms_crtc.c index 28a57ae109fcc..ae4e36bc337c4 100644 --- a/drivers/gpu/drm/vkms/vkms_crtc.c +++ b/drivers/gpu/drm/vkms/vkms_crtc.c @@ -64,8 +64,8 @@ static int vkms_enable_vblank(struct drm_crtc *crtc) struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct vkms_output *out = drm_crtc_to_vkms_output(crtc); - hrtimer_init(&out->vblank_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - out->vblank_hrtimer.function = &vkms_vblank_simulate; + hrtimer_setup(&out->vblank_hrtimer, &vkms_vblank_simulate, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); out->period_ns = ktime_set(0, vblank->framedur_ns); hrtimer_start(&out->vblank_hrtimer, out->period_ns, HRTIMER_MODE_REL); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c index 8651b788e98bc..aec774fa4d7bf 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c @@ -290,8 +290,8 @@ vmw_vkms_enable_vblank(struct drm_crtc *crtc) drm_calc_timestamping_constants(crtc, &crtc->mode); - hrtimer_init(&du->vkms.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - du->vkms.timer.function = &vmw_vkms_vblank_simulate; + hrtimer_setup(&du->vkms.timer, &vmw_vkms_vblank_simulate, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); du->vkms.period_ns = ktime_set(0, vblank->framedur_ns); hrtimer_start(&du->vkms.timer, du->vkms.period_ns, HRTIMER_MODE_REL); diff --git a/drivers/gpu/drm/xe/display/xe_plane_initial.c b/drivers/gpu/drm/xe/display/xe_plane_initial.c index 2eb9633f163a7..2a2f250fa495d 100644 --- a/drivers/gpu/drm/xe/display/xe_plane_initial.c +++ b/drivers/gpu/drm/xe/display/xe_plane_initial.c @@ -194,8 +194,6 @@ intel_find_initial_plane_obj(struct intel_crtc *crtc, to_intel_plane(crtc->base.primary); struct intel_plane_state *plane_state = to_intel_plane_state(plane->base.state); - struct intel_crtc_state *crtc_state = - to_intel_crtc_state(crtc->base.state); struct drm_framebuffer *fb; struct i915_vma *vma; @@ -241,14 +239,6 @@ intel_find_initial_plane_obj(struct intel_crtc *crtc, atomic_or(plane->frontbuffer_bit, &to_intel_frontbuffer(fb)->bits); plane_config->vma = vma; - - /* - * Flip to the newly created mapping ASAP, so we can re-use the - * first part of GGTT for WOPCM, prevent flickering, and prevent - * the lookup of sysmem scratch pages. - */ - plane->check_plane(crtc_state, plane_state); - plane->async_flip(NULL, plane, crtc_state, plane_state, true); return; nofb: diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index d86219dedde2a..b732c89816dff 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -53,7 +53,6 @@ #define RING_CTL(base) XE_REG((base) + 0x3c) #define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ -#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ #define RING_START_UDW(base) XE_REG((base) + 0x48) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 5d6fb79957b63..9f4f27d1ef4a9 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -380,9 +380,7 @@ int xe_gt_init_early(struct xe_gt *gt) if (err) return err; - xe_wa_process_gt(gt); xe_wa_process_oob(gt); - xe_tuning_process_gt(gt); xe_force_wake_init_gt(gt, gt_to_fw(gt)); spin_lock_init(>->global_invl_lock); @@ -474,6 +472,8 @@ static int all_fw_domain_init(struct xe_gt *gt) } xe_gt_mcr_set_implicit_defaults(gt); + xe_wa_process_gt(gt); + xe_tuning_process_gt(gt); xe_reg_sr_apply_mmio(>->reg_sr, gt); err = xe_gt_clock_init(gt); diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 50c8076b51585..72ad576fc18eb 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -1723,9 +1723,11 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, drm_printf(p, "\tg2h outstanding: %d\n", snapshot->g2h_outstanding); - if (snapshot->ctb) - xe_print_blob_ascii85(p, "CTB data", '\n', + if (snapshot->ctb) { + drm_printf(p, "[CTB].length: 0x%zx\n", snapshot->ctb_size); + xe_print_blob_ascii85(p, "[CTB].data", '\n', snapshot->ctb, 0, snapshot->ctb_size); + } } else { drm_puts(p, "CT disabled\n"); } diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index 2baa4d95571fb..0ca3056d8bd3f 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -208,10 +208,11 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_ drm_printf(p, "GuC timestamp: 0x%08llX [%llu]\n", snapshot->stamp, snapshot->stamp); drm_printf(p, "Log level: %u\n", snapshot->level); + drm_printf(p, "[LOG].length: 0x%zx\n", snapshot->size); remain = snapshot->size; for (i = 0; i < snapshot->num_chunks; i++) { size_t size = min(GUC_LOG_CHUNK_SIZE, remain); - const char *prefix = i ? NULL : "Log data"; + const char *prefix = i ? NULL : "[LOG].data"; char suffix = i == snapshot->num_chunks - 1 ? '\n' : 0; xe_print_blob_ascii85(p, prefix, suffix, snapshot->copy[i], 0, size); diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 913c74d6e2aeb..b6a2dd742ebdc 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1248,6 +1248,8 @@ static void __guc_exec_queue_fini_async(struct work_struct *w) if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); + /* Confirm no work left behind accessing device structures */ + cancel_delayed_work_sync(&ge->sched.base.work_tdr); release_guc_id(guc, q); xe_sched_entity_fini(&ge->entity); xe_sched_fini(&ge->sched); diff --git a/drivers/gpu/drm/xe/xe_hmm.c b/drivers/gpu/drm/xe/xe_hmm.c index 0898344678801..392102515f3d8 100644 --- a/drivers/gpu/drm/xe/xe_hmm.c +++ b/drivers/gpu/drm/xe/xe_hmm.c @@ -19,11 +19,10 @@ static u64 xe_npages_in_range(unsigned long start, unsigned long end) return (end - start) >> PAGE_SHIFT; } -/* +/** * xe_mark_range_accessed() - mark a range is accessed, so core mm * have such information for memory eviction or write back to * hard disk - * * @range: the range to mark * @write: if write to this range, we mark pages in this range * as dirty @@ -43,15 +42,51 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write) } } -/* +static int xe_alloc_sg(struct xe_device *xe, struct sg_table *st, + struct hmm_range *range, struct rw_semaphore *notifier_sem) +{ + unsigned long i, npages, hmm_pfn; + unsigned long num_chunks = 0; + int ret; + + /* HMM docs says this is needed. */ + ret = down_read_interruptible(notifier_sem); + if (ret) + return ret; + + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { + up_read(notifier_sem); + return -EAGAIN; + } + + npages = xe_npages_in_range(range->start, range->end); + for (i = 0; i < npages;) { + unsigned long len; + + hmm_pfn = range->hmm_pfns[i]; + xe_assert(xe, hmm_pfn & HMM_PFN_VALID); + + len = 1UL << hmm_pfn_to_map_order(hmm_pfn); + + /* If order > 0 the page may extend beyond range->start */ + len -= (hmm_pfn & ~HMM_PFN_FLAGS) & (len - 1); + i += len; + num_chunks++; + } + up_read(notifier_sem); + + return sg_alloc_table(st, num_chunks, GFP_KERNEL); +} + +/** * xe_build_sg() - build a scatter gather table for all the physical pages/pfn * in a hmm_range. dma-map pages if necessary. dma-address is save in sg table * and will be used to program GPU page table later. - * * @xe: the xe device who will access the dma-address in sg table * @range: the hmm range that we build the sg table from. range->hmm_pfns[] * has the pfn numbers of pages that back up this hmm address range. * @st: pointer to the sg table. + * @notifier_sem: The xe notifier lock. * @write: whether we write to this range. This decides dma map direction * for system pages. If write we map it bi-diretional; otherwise * DMA_TO_DEVICE @@ -78,43 +113,84 @@ static void xe_mark_range_accessed(struct hmm_range *range, bool write) * Returns 0 if successful; -ENOMEM if fails to allocate memory */ static int xe_build_sg(struct xe_device *xe, struct hmm_range *range, - struct sg_table *st, bool write) + struct sg_table *st, + struct rw_semaphore *notifier_sem, + bool write) { + unsigned long npages = xe_npages_in_range(range->start, range->end); struct device *dev = xe->drm.dev; - struct page **pages; - u64 i, npages; - int ret; + struct scatterlist *sgl; + struct page *page; + unsigned long i, j; - npages = xe_npages_in_range(range->start, range->end); - pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL); - if (!pages) - return -ENOMEM; + lockdep_assert_held(notifier_sem); - for (i = 0; i < npages; i++) { - pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]); - xe_assert(xe, !is_device_private_page(pages[i])); + i = 0; + for_each_sg(st->sgl, sgl, st->nents, j) { + unsigned long hmm_pfn, size; + + hmm_pfn = range->hmm_pfns[i]; + page = hmm_pfn_to_page(hmm_pfn); + xe_assert(xe, !is_device_private_page(page)); + + size = 1UL << hmm_pfn_to_map_order(hmm_pfn); + size -= page_to_pfn(page) & (size - 1); + i += size; + + if (unlikely(j == st->nents - 1)) { + if (i > npages) + size -= (i - npages); + sg_mark_end(sgl); + } + sg_set_page(sgl, page, size << PAGE_SHIFT, 0); } + xe_assert(xe, i == npages); - ret = sg_alloc_table_from_pages_segment(st, pages, npages, 0, npages << PAGE_SHIFT, - xe_sg_segment_size(dev), GFP_KERNEL); - if (ret) - goto free_pages; + return dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); +} + +static void xe_hmm_userptr_set_mapped(struct xe_userptr_vma *uvma) +{ + struct xe_userptr *userptr = &uvma->userptr; + struct xe_vm *vm = xe_vma_vm(&uvma->vma); + + lockdep_assert_held_write(&vm->lock); + lockdep_assert_held(&vm->userptr.notifier_lock); + + mutex_lock(&userptr->unmap_mutex); + xe_assert(vm->xe, !userptr->mapped); + userptr->mapped = true; + mutex_unlock(&userptr->unmap_mutex); +} + +void xe_hmm_userptr_unmap(struct xe_userptr_vma *uvma) +{ + struct xe_userptr *userptr = &uvma->userptr; + struct xe_vma *vma = &uvma->vma; + bool write = !xe_vma_read_only(vma); + struct xe_vm *vm = xe_vma_vm(vma); + struct xe_device *xe = vm->xe; - ret = dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); - if (ret) { - sg_free_table(st); - st = NULL; + if (!lockdep_is_held_type(&vm->userptr.notifier_lock, 0) && + !lockdep_is_held_type(&vm->lock, 0) && + !(vma->gpuva.flags & XE_VMA_DESTROYED)) { + /* Don't unmap in exec critical section. */ + xe_vm_assert_held(vm); + /* Don't unmap while mapping the sg. */ + lockdep_assert_held(&vm->lock); } -free_pages: - kvfree(pages); - return ret; + mutex_lock(&userptr->unmap_mutex); + if (userptr->sg && userptr->mapped) + dma_unmap_sgtable(xe->drm.dev, userptr->sg, + write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, 0); + userptr->mapped = false; + mutex_unlock(&userptr->unmap_mutex); } -/* +/** * xe_hmm_userptr_free_sg() - Free the scatter gather table of userptr - * * @uvma: the userptr vma which hold the scatter gather table * * With function xe_userptr_populate_range, we allocate storage of @@ -124,16 +200,9 @@ static int xe_build_sg(struct xe_device *xe, struct hmm_range *range, void xe_hmm_userptr_free_sg(struct xe_userptr_vma *uvma) { struct xe_userptr *userptr = &uvma->userptr; - struct xe_vma *vma = &uvma->vma; - bool write = !xe_vma_read_only(vma); - struct xe_vm *vm = xe_vma_vm(vma); - struct xe_device *xe = vm->xe; - struct device *dev = xe->drm.dev; - - xe_assert(xe, userptr->sg); - dma_unmap_sgtable(dev, userptr->sg, - write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, 0); + xe_assert(xe_vma_vm(&uvma->vma)->xe, userptr->sg); + xe_hmm_userptr_unmap(uvma); sg_free_table(userptr->sg); userptr->sg = NULL; } @@ -166,13 +235,20 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma, { unsigned long timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); - unsigned long *pfns, flags = HMM_PFN_REQ_FAULT; + unsigned long *pfns; struct xe_userptr *userptr; struct xe_vma *vma = &uvma->vma; u64 userptr_start = xe_vma_userptr(vma); u64 userptr_end = userptr_start + xe_vma_size(vma); struct xe_vm *vm = xe_vma_vm(vma); - struct hmm_range hmm_range; + struct hmm_range hmm_range = { + .pfn_flags_mask = 0, /* ignore pfns */ + .default_flags = HMM_PFN_REQ_FAULT, + .start = userptr_start, + .end = userptr_end, + .notifier = &uvma->userptr.notifier, + .dev_private_owner = vm->xe, + }; bool write = !xe_vma_read_only(vma); unsigned long notifier_seq; u64 npages; @@ -199,19 +275,14 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma, return -ENOMEM; if (write) - flags |= HMM_PFN_REQ_WRITE; + hmm_range.default_flags |= HMM_PFN_REQ_WRITE; if (!mmget_not_zero(userptr->notifier.mm)) { ret = -EFAULT; goto free_pfns; } - hmm_range.default_flags = flags; hmm_range.hmm_pfns = pfns; - hmm_range.notifier = &userptr->notifier; - hmm_range.start = userptr_start; - hmm_range.end = userptr_end; - hmm_range.dev_private_owner = vm->xe; while (true) { hmm_range.notifier_seq = mmu_interval_read_begin(&userptr->notifier); @@ -238,16 +309,37 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma, if (ret) goto free_pfns; - ret = xe_build_sg(vm->xe, &hmm_range, &userptr->sgt, write); + ret = xe_alloc_sg(vm->xe, &userptr->sgt, &hmm_range, &vm->userptr.notifier_lock); if (ret) goto free_pfns; + ret = down_read_interruptible(&vm->userptr.notifier_lock); + if (ret) + goto free_st; + + if (mmu_interval_read_retry(hmm_range.notifier, hmm_range.notifier_seq)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = xe_build_sg(vm->xe, &hmm_range, &userptr->sgt, + &vm->userptr.notifier_lock, write); + if (ret) + goto out_unlock; + xe_mark_range_accessed(&hmm_range, write); userptr->sg = &userptr->sgt; + xe_hmm_userptr_set_mapped(uvma); userptr->notifier_seq = hmm_range.notifier_seq; + up_read(&vm->userptr.notifier_lock); + kvfree(pfns); + return 0; +out_unlock: + up_read(&vm->userptr.notifier_lock); +free_st: + sg_free_table(&userptr->sgt); free_pfns: kvfree(pfns); return ret; } - diff --git a/drivers/gpu/drm/xe/xe_hmm.h b/drivers/gpu/drm/xe/xe_hmm.h index 909dc2bdcd97e..0ea98d8e7bbc7 100644 --- a/drivers/gpu/drm/xe/xe_hmm.h +++ b/drivers/gpu/drm/xe/xe_hmm.h @@ -3,9 +3,16 @@ * Copyright © 2024 Intel Corporation */ +#ifndef _XE_HMM_H_ +#define _XE_HMM_H_ + #include struct xe_userptr_vma; int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma, bool is_mm_mmap_locked); + void xe_hmm_userptr_free_sg(struct xe_userptr_vma *uvma); + +void xe_hmm_userptr_unmap(struct xe_userptr_vma *uvma); +#endif diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 32f5a67a917b5..08552ee3fb94b 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -757,19 +757,7 @@ int xe_irq_install(struct xe_device *xe) xe_irq_postinstall(xe); - err = devm_add_action_or_reset(xe->drm.dev, irq_uninstall, xe); - if (err) - goto free_irq_handler; - - return 0; - -free_irq_handler: - if (xe_device_has_msix(xe)) - xe_irq_msix_free(xe); - else - xe_irq_msi_free(xe); - - return err; + return devm_add_action_or_reset(xe->drm.dev, irq_uninstall, xe); } static void xe_irq_msi_synchronize_irq(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index fa873f3d0a9d1..1fa46a04425ee 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -1689,7 +1689,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, stream->oa_buffer.format = &stream->oa->oa_formats[param->oa_format]; stream->sample = param->sample; - stream->periodic = param->period_exponent > 0; + stream->periodic = param->period_exponent >= 0; stream->period_exponent = param->period_exponent; stream->no_preempt = param->no_preempt; stream->wait_num_reports = param->wait_num_reports; @@ -1766,8 +1766,8 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, WRITE_ONCE(u->exclusive_stream, stream); - hrtimer_init(&stream->poll_check_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - stream->poll_check_timer.function = xe_oa_poll_check_timer_cb; + hrtimer_setup(&stream->poll_check_timer, xe_oa_poll_check_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); init_waitqueue_head(&stream->poll_wq); spin_lock_init(&stream->oa_buffer.ptr_lock); @@ -1970,6 +1970,7 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f } param.xef = xef; + param.period_exponent = -1; ret = xe_oa_user_extensions(oa, XE_OA_USER_EXTN_FROM_OPEN, data, 0, ¶m); if (ret) return ret; @@ -2024,7 +2025,7 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f goto err_exec_q; } - if (param.period_exponent > 0) { + if (param.period_exponent >= 0) { u64 oa_period, oa_freq_hz; /* Requesting samples from OAG buffer is a privileged operation */ diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 1ddcc7e79a93e..dc24baa840924 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -28,6 +28,8 @@ struct xe_pt_dir { struct xe_pt pt; /** @children: Array of page-table child nodes */ struct xe_ptw *children[XE_PDES]; + /** @staging: Array of page-table staging nodes */ + struct xe_ptw *staging[XE_PDES]; }; #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM) @@ -48,9 +50,10 @@ static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt) return container_of(pt, struct xe_pt_dir, pt); } -static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index) +static struct xe_pt * +xe_pt_entry_staging(struct xe_pt_dir *pt_dir, unsigned int index) { - return container_of(pt_dir->children[index], struct xe_pt, base); + return container_of(pt_dir->staging[index], struct xe_pt, base); } static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm, @@ -125,6 +128,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile, } pt->bo = bo; pt->base.children = level ? as_xe_pt_dir(pt)->children : NULL; + pt->base.staging = level ? as_xe_pt_dir(pt)->staging : NULL; if (vm->xef) xe_drm_client_add_bo(vm->xef->client, pt->bo); @@ -206,8 +210,8 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred) struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt); for (i = 0; i < XE_PDES; i++) { - if (xe_pt_entry(pt_dir, i)) - xe_pt_destroy(xe_pt_entry(pt_dir, i), flags, + if (xe_pt_entry_staging(pt_dir, i)) + xe_pt_destroy(xe_pt_entry_staging(pt_dir, i), flags, deferred); } } @@ -376,8 +380,10 @@ xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent, /* Continue building a non-connected subtree. */ struct iosys_map *map = &parent->bo->vmap; - if (unlikely(xe_child)) + if (unlikely(xe_child)) { parent->base.children[offset] = &xe_child->base; + parent->base.staging[offset] = &xe_child->base; + } xe_pt_write(xe_walk->vm->xe, map, offset, pte); parent->num_live++; @@ -614,6 +620,7 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma, .ops = &xe_pt_stage_bind_ops, .shifts = xe_normal_pt_shifts, .max_level = XE_PT_HIGHEST_LEVEL, + .staging = true, }, .vm = xe_vma_vm(vma), .tile = tile, @@ -873,7 +880,7 @@ static void xe_pt_cancel_bind(struct xe_vma *vma, } } -static void xe_pt_commit_locks_assert(struct xe_vma *vma) +static void xe_pt_commit_prepare_locks_assert(struct xe_vma *vma) { struct xe_vm *vm = xe_vma_vm(vma); @@ -885,6 +892,16 @@ static void xe_pt_commit_locks_assert(struct xe_vma *vma) xe_vm_assert_held(vm); } +static void xe_pt_commit_locks_assert(struct xe_vma *vma) +{ + struct xe_vm *vm = xe_vma_vm(vma); + + xe_pt_commit_prepare_locks_assert(vma); + + if (xe_vma_is_userptr(vma)) + lockdep_assert_held_read(&vm->userptr.notifier_lock); +} + static void xe_pt_commit(struct xe_vma *vma, struct xe_vm_pgtable_update *entries, u32 num_entries, struct llist_head *deferred) @@ -895,13 +912,17 @@ static void xe_pt_commit(struct xe_vma *vma, for (i = 0; i < num_entries; i++) { struct xe_pt *pt = entries[i].pt; + struct xe_pt_dir *pt_dir; if (!pt->level) continue; + pt_dir = as_xe_pt_dir(pt); for (j = 0; j < entries[i].qwords; j++) { struct xe_pt *oldpte = entries[i].pt_entries[j].pt; + int j_ = j + entries[i].ofs; + pt_dir->children[j_] = pt_dir->staging[j_]; xe_pt_destroy(oldpte, xe_vma_vm(vma)->flags, deferred); } } @@ -913,7 +934,7 @@ static void xe_pt_abort_bind(struct xe_vma *vma, { int i, j; - xe_pt_commit_locks_assert(vma); + xe_pt_commit_prepare_locks_assert(vma); for (i = num_entries - 1; i >= 0; --i) { struct xe_pt *pt = entries[i].pt; @@ -928,10 +949,10 @@ static void xe_pt_abort_bind(struct xe_vma *vma, pt_dir = as_xe_pt_dir(pt); for (j = 0; j < entries[i].qwords; j++) { u32 j_ = j + entries[i].ofs; - struct xe_pt *newpte = xe_pt_entry(pt_dir, j_); + struct xe_pt *newpte = xe_pt_entry_staging(pt_dir, j_); struct xe_pt *oldpte = entries[i].pt_entries[j].pt; - pt_dir->children[j_] = oldpte ? &oldpte->base : 0; + pt_dir->staging[j_] = oldpte ? &oldpte->base : 0; xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL); } } @@ -943,7 +964,7 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma, { u32 i, j; - xe_pt_commit_locks_assert(vma); + xe_pt_commit_prepare_locks_assert(vma); for (i = 0; i < num_entries; i++) { struct xe_pt *pt = entries[i].pt; @@ -961,10 +982,10 @@ static void xe_pt_commit_prepare_bind(struct xe_vma *vma, struct xe_pt *newpte = entries[i].pt_entries[j].pt; struct xe_pt *oldpte = NULL; - if (xe_pt_entry(pt_dir, j_)) - oldpte = xe_pt_entry(pt_dir, j_); + if (xe_pt_entry_staging(pt_dir, j_)) + oldpte = xe_pt_entry_staging(pt_dir, j_); - pt_dir->children[j_] = &newpte->base; + pt_dir->staging[j_] = &newpte->base; entries[i].pt_entries[j].pt = oldpte; } } @@ -1213,42 +1234,22 @@ static int vma_check_userptr(struct xe_vm *vm, struct xe_vma *vma, return 0; uvma = to_userptr_vma(vma); - notifier_seq = uvma->userptr.notifier_seq; + if (xe_pt_userptr_inject_eagain(uvma)) + xe_vma_userptr_force_invalidate(uvma); - if (uvma->userptr.initial_bind && !xe_vm_in_fault_mode(vm)) - return 0; + notifier_seq = uvma->userptr.notifier_seq; if (!mmu_interval_read_retry(&uvma->userptr.notifier, - notifier_seq) && - !xe_pt_userptr_inject_eagain(uvma)) + notifier_seq)) return 0; - if (xe_vm_in_fault_mode(vm)) { + if (xe_vm_in_fault_mode(vm)) return -EAGAIN; - } else { - spin_lock(&vm->userptr.invalidated_lock); - list_move_tail(&uvma->userptr.invalidate_link, - &vm->userptr.invalidated); - spin_unlock(&vm->userptr.invalidated_lock); - - if (xe_vm_in_preempt_fence_mode(vm)) { - struct dma_resv_iter cursor; - struct dma_fence *fence; - long err; - - dma_resv_iter_begin(&cursor, xe_vm_resv(vm), - DMA_RESV_USAGE_BOOKKEEP); - dma_resv_for_each_fence_unlocked(&cursor, fence) - dma_fence_enable_sw_signaling(fence); - dma_resv_iter_end(&cursor); - - err = dma_resv_wait_timeout(xe_vm_resv(vm), - DMA_RESV_USAGE_BOOKKEEP, - false, MAX_SCHEDULE_TIMEOUT); - XE_WARN_ON(err <= 0); - } - } + /* + * Just continue the operation since exec or rebind worker + * will take care of rebinding. + */ return 0; } @@ -1514,6 +1515,7 @@ static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma, .ops = &xe_pt_stage_unbind_ops, .shifts = xe_normal_pt_shifts, .max_level = XE_PT_HIGHEST_LEVEL, + .staging = true, }, .tile = tile, .modified_start = xe_vma_start(vma), @@ -1555,7 +1557,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma, { int i, j; - xe_pt_commit_locks_assert(vma); + xe_pt_commit_prepare_locks_assert(vma); for (i = num_entries - 1; i >= 0; --i) { struct xe_vm_pgtable_update *entry = &entries[i]; @@ -1568,7 +1570,7 @@ static void xe_pt_abort_unbind(struct xe_vma *vma, continue; for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) - pt_dir->children[j] = + pt_dir->staging[j] = entries[i].pt_entries[j - entry->ofs].pt ? &entries[i].pt_entries[j - entry->ofs].pt->base : NULL; } @@ -1581,7 +1583,7 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma, { int i, j; - xe_pt_commit_locks_assert(vma); + xe_pt_commit_prepare_locks_assert(vma); for (i = 0; i < num_entries; ++i) { struct xe_vm_pgtable_update *entry = &entries[i]; @@ -1595,8 +1597,8 @@ xe_pt_commit_prepare_unbind(struct xe_vma *vma, pt_dir = as_xe_pt_dir(pt); for (j = entry->ofs; j < entry->ofs + entry->qwords; j++) { entry->pt_entries[j - entry->ofs].pt = - xe_pt_entry(pt_dir, j); - pt_dir->children[j] = NULL; + xe_pt_entry_staging(pt_dir, j); + pt_dir->staging[j] = NULL; } } } diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c b/drivers/gpu/drm/xe/xe_pt_walk.c index b8b3d2aea4923..be602a763ff32 100644 --- a/drivers/gpu/drm/xe/xe_pt_walk.c +++ b/drivers/gpu/drm/xe/xe_pt_walk.c @@ -74,7 +74,8 @@ int xe_pt_walk_range(struct xe_ptw *parent, unsigned int level, u64 addr, u64 end, struct xe_pt_walk *walk) { pgoff_t offset = xe_pt_offset(addr, level, walk); - struct xe_ptw **entries = parent->children ? parent->children : NULL; + struct xe_ptw **entries = walk->staging ? (parent->staging ?: NULL) : + (parent->children ?: NULL); const struct xe_pt_walk_ops *ops = walk->ops; enum page_walk_action action; struct xe_ptw *child; diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h b/drivers/gpu/drm/xe/xe_pt_walk.h index 5ecc4d2f0f653..5c02c244f7de3 100644 --- a/drivers/gpu/drm/xe/xe_pt_walk.h +++ b/drivers/gpu/drm/xe/xe_pt_walk.h @@ -11,12 +11,14 @@ /** * struct xe_ptw - base class for driver pagetable subclassing. * @children: Pointer to an array of children if any. + * @staging: Pointer to an array of staging if any. * * Drivers could subclass this, and if it's a page-directory, typically * embed an array of xe_ptw pointers. */ struct xe_ptw { struct xe_ptw **children; + struct xe_ptw **staging; }; /** @@ -41,6 +43,8 @@ struct xe_pt_walk { * as shared pagetables. */ bool shared_pt_mode; + /** @staging: Walk staging PT structure */ + bool staging; }; /** diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 690330352d4cd..ec6ec18ab3faa 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -579,51 +579,26 @@ static void preempt_rebind_work_func(struct work_struct *w) trace_xe_vm_rebind_worker_exit(vm); } -static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni, - const struct mmu_notifier_range *range, - unsigned long cur_seq) +static void __vma_userptr_invalidate(struct xe_vm *vm, struct xe_userptr_vma *uvma) { - struct xe_userptr *userptr = container_of(mni, typeof(*userptr), notifier); - struct xe_userptr_vma *uvma = container_of(userptr, typeof(*uvma), userptr); + struct xe_userptr *userptr = &uvma->userptr; struct xe_vma *vma = &uvma->vma; - struct xe_vm *vm = xe_vma_vm(vma); struct dma_resv_iter cursor; struct dma_fence *fence; long err; - xe_assert(vm->xe, xe_vma_is_userptr(vma)); - trace_xe_vma_userptr_invalidate(vma); - - if (!mmu_notifier_range_blockable(range)) - return false; - - vm_dbg(&xe_vma_vm(vma)->xe->drm, - "NOTIFIER: addr=0x%016llx, range=0x%016llx", - xe_vma_start(vma), xe_vma_size(vma)); - - down_write(&vm->userptr.notifier_lock); - mmu_interval_set_seq(mni, cur_seq); - - /* No need to stop gpu access if the userptr is not yet bound. */ - if (!userptr->initial_bind) { - up_write(&vm->userptr.notifier_lock); - return true; - } - /* * Tell exec and rebind worker they need to repin and rebind this * userptr. */ if (!xe_vm_in_fault_mode(vm) && - !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) { + !(vma->gpuva.flags & XE_VMA_DESTROYED)) { spin_lock(&vm->userptr.invalidated_lock); list_move_tail(&userptr->invalidate_link, &vm->userptr.invalidated); spin_unlock(&vm->userptr.invalidated_lock); } - up_write(&vm->userptr.notifier_lock); - /* * Preempt fences turn into schedule disables, pipeline these. * Note that even in fault mode, we need to wait for binds and @@ -641,11 +616,37 @@ static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni, false, MAX_SCHEDULE_TIMEOUT); XE_WARN_ON(err <= 0); - if (xe_vm_in_fault_mode(vm)) { + if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) { err = xe_vm_invalidate_vma(vma); XE_WARN_ON(err); } + xe_hmm_userptr_unmap(uvma); +} + +static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct xe_userptr_vma *uvma = container_of(mni, typeof(*uvma), userptr.notifier); + struct xe_vma *vma = &uvma->vma; + struct xe_vm *vm = xe_vma_vm(vma); + + xe_assert(vm->xe, xe_vma_is_userptr(vma)); + trace_xe_vma_userptr_invalidate(vma); + + if (!mmu_notifier_range_blockable(range)) + return false; + + vm_dbg(&xe_vma_vm(vma)->xe->drm, + "NOTIFIER: addr=0x%016llx, range=0x%016llx", + xe_vma_start(vma), xe_vma_size(vma)); + + down_write(&vm->userptr.notifier_lock); + mmu_interval_set_seq(mni, cur_seq); + + __vma_userptr_invalidate(vm, uvma); + up_write(&vm->userptr.notifier_lock); trace_xe_vma_userptr_invalidate_complete(vma); return true; @@ -655,6 +656,34 @@ static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = { .invalidate = vma_userptr_invalidate, }; +#if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) +/** + * xe_vma_userptr_force_invalidate() - force invalidate a userptr + * @uvma: The userptr vma to invalidate + * + * Perform a forced userptr invalidation for testing purposes. + */ +void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma) +{ + struct xe_vm *vm = xe_vma_vm(&uvma->vma); + + /* Protect against concurrent userptr pinning */ + lockdep_assert_held(&vm->lock); + /* Protect against concurrent notifiers */ + lockdep_assert_held(&vm->userptr.notifier_lock); + /* + * Protect against concurrent instances of this function and + * the critical exec sections + */ + xe_vm_assert_held(vm); + + if (!mmu_interval_read_retry(&uvma->userptr.notifier, + uvma->userptr.notifier_seq)) + uvma->userptr.notifier_seq -= 2; + __vma_userptr_invalidate(vm, uvma); +} +#endif + int xe_vm_userptr_pin(struct xe_vm *vm) { struct xe_userptr_vma *uvma, *next; @@ -666,20 +695,33 @@ int xe_vm_userptr_pin(struct xe_vm *vm) /* Collect invalidated userptrs */ spin_lock(&vm->userptr.invalidated_lock); + xe_assert(vm->xe, list_empty(&vm->userptr.repin_list)); list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated, userptr.invalidate_link) { list_del_init(&uvma->userptr.invalidate_link); - list_move_tail(&uvma->userptr.repin_link, - &vm->userptr.repin_list); + list_add_tail(&uvma->userptr.repin_link, + &vm->userptr.repin_list); } spin_unlock(&vm->userptr.invalidated_lock); - /* Pin and move to temporary list */ + /* Pin and move to bind list */ list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list, userptr.repin_link) { err = xe_vma_userptr_pin_pages(uvma); if (err == -EFAULT) { list_del_init(&uvma->userptr.repin_link); + /* + * We might have already done the pin once already, but + * then had to retry before the re-bind happened, due + * some other condition in the caller, but in the + * meantime the userptr got dinged by the notifier such + * that we need to revalidate here, but this time we hit + * the EFAULT. In such a case make sure we remove + * ourselves from the rebind list to avoid going down in + * flames. + */ + if (!list_empty(&uvma->vma.combined_links.rebind)) + list_del_init(&uvma->vma.combined_links.rebind); /* Wait for pending binds */ xe_vm_lock(vm, false); @@ -690,10 +732,10 @@ int xe_vm_userptr_pin(struct xe_vm *vm) err = xe_vm_invalidate_vma(&uvma->vma); xe_vm_unlock(vm); if (err) - return err; + break; } else { - if (err < 0) - return err; + if (err) + break; list_del_init(&uvma->userptr.repin_link); list_move_tail(&uvma->vma.combined_links.rebind, @@ -701,7 +743,19 @@ int xe_vm_userptr_pin(struct xe_vm *vm) } } - return 0; + if (err) { + down_write(&vm->userptr.notifier_lock); + spin_lock(&vm->userptr.invalidated_lock); + list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list, + userptr.repin_link) { + list_del_init(&uvma->userptr.repin_link); + list_move_tail(&uvma->userptr.invalidate_link, + &vm->userptr.invalidated); + } + spin_unlock(&vm->userptr.invalidated_lock); + up_write(&vm->userptr.notifier_lock); + } + return err; } /** @@ -987,6 +1041,7 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm, INIT_LIST_HEAD(&userptr->invalidate_link); INIT_LIST_HEAD(&userptr->repin_link); vma->gpuva.gem.offset = bo_offset_or_userptr; + mutex_init(&userptr->unmap_mutex); err = mmu_interval_notifier_insert(&userptr->notifier, current->mm, @@ -1028,6 +1083,7 @@ static void xe_vma_destroy_late(struct xe_vma *vma) * them anymore */ mmu_interval_notifier_remove(&userptr->notifier); + mutex_destroy(&userptr->unmap_mutex); xe_vm_put(vm); } else if (xe_vma_is_null(vma)) { xe_vm_put(vm); @@ -1066,6 +1122,7 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence) xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED); spin_lock(&vm->userptr.invalidated_lock); + xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link)); list_del(&to_userptr_vma(vma)->userptr.invalidate_link); spin_unlock(&vm->userptr.invalidated_lock); } else if (!xe_vma_is_null(vma)) { @@ -2260,8 +2317,17 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct drm_gpuva_ops *ops, break; } case DRM_GPUVA_OP_UNMAP: + xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask); + break; case DRM_GPUVA_OP_PREFETCH: - /* FIXME: Need to skip some prefetch ops */ + vma = gpuva_to_vma(op->base.prefetch.va); + + if (xe_vma_is_userptr(vma)) { + err = xe_vma_userptr_pin_pages(to_userptr_vma(vma)); + if (err) + return err; + } + xe_vma_ops_incr_pt_update_ops(vops, op->tile_mask); break; default: diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 23adb74428815..b882bfb31bd05 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -274,9 +274,17 @@ static inline void vm_dbg(const struct drm_device *dev, const char *format, ...) { /* noop */ } #endif -#endif struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm); void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap); void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p); void xe_vm_snapshot_free(struct xe_vm_snapshot *snap); + +#if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) +void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma); +#else +static inline void xe_vma_userptr_force_invalidate(struct xe_userptr_vma *uvma) +{ +} +#endif +#endif diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 7f9a303e51d89..a4b4091cfd0da 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -59,12 +59,16 @@ struct xe_userptr { struct sg_table *sg; /** @notifier_seq: notifier sequence number */ unsigned long notifier_seq; + /** @unmap_mutex: Mutex protecting dma-unmapping */ + struct mutex unmap_mutex; /** * @initial_bind: user pointer has been bound at least once. * write: vm->userptr.notifier_lock in read mode and vm->resv held. * read: vm->userptr.notifier_lock in write mode or vm->resv held. */ bool initial_bind; + /** @mapped: Whether the @sgt sg-table is dma-mapped. Protected by @unmap_mutex. */ + bool mapped; #if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) u32 divisor; #endif @@ -227,8 +231,8 @@ struct xe_vm { * up for revalidation. Protected from access with the * @invalidated_lock. Removing items from the list * additionally requires @lock in write mode, and adding - * items to the list requires the @userptr.notifer_lock in - * write mode. + * items to the list requires either the @userptr.notifer_lock in + * write mode, OR @lock in write mode. */ struct list_head invalidated; } userptr; diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 49812a76b7edd..d900dd05c335c 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -378,6 +378,12 @@ static bool apple_is_non_apple_keyboard(struct hid_device *hdev) return false; } +static bool apple_is_omoton_kb066(struct hid_device *hdev) +{ + return hdev->product == USB_DEVICE_ID_APPLE_ALU_WIRELESS_ANSI && + strcmp(hdev->name, "Bluetooth Keyboard") == 0; +} + static inline void apple_setup_key_translation(struct input_dev *input, const struct apple_key_translation *table) { @@ -546,9 +552,6 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, } } - if (usage->hid == 0xc0301) /* Omoton KB066 quirk */ - code = KEY_F6; - if (usage->code != code) { input_event_with_scancode(input, usage->type, code, usage->hid, value); @@ -728,7 +731,7 @@ static int apple_input_configured(struct hid_device *hdev, { struct apple_sc *asc = hid_get_drvdata(hdev); - if ((asc->quirks & APPLE_HAS_FN) && !asc->fn_found) { + if (((asc->quirks & APPLE_HAS_FN) && !asc->fn_found) || apple_is_omoton_kb066(hdev)) { hid_info(hdev, "Fn key not found (Apple Wireless Keyboard clone?), disabling Fn key handling\n"); asc->quirks &= ~APPLE_HAS_FN; } diff --git a/drivers/hid/hid-appleir.c b/drivers/hid/hid-appleir.c index 8deded1857254..c45e5aa569d25 100644 --- a/drivers/hid/hid-appleir.c +++ b/drivers/hid/hid-appleir.c @@ -188,7 +188,7 @@ static int appleir_raw_event(struct hid_device *hid, struct hid_report *report, static const u8 flatbattery[] = { 0x25, 0x87, 0xe0 }; unsigned long flags; - if (len != 5) + if (len != 5 || !(hid->claimed & HID_CLAIMED_INPUT)) goto out; if (!memcmp(data, keydown, sizeof(keydown))) { diff --git a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c index 56e858066c3c3..afbd67aa97192 100644 --- a/drivers/hid/hid-corsair-void.c +++ b/drivers/hid/hid-corsair-void.c @@ -71,11 +71,9 @@ #include #include -#include #include #include #include -#include #include #include #include @@ -120,6 +118,12 @@ enum { CORSAIR_VOID_BATTERY_CHARGING = 5, }; +enum { + CORSAIR_VOID_ADD_BATTERY = 0, + CORSAIR_VOID_REMOVE_BATTERY = 1, + CORSAIR_VOID_UPDATE_BATTERY = 2, +}; + static enum power_supply_property corsair_void_battery_props[] = { POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_PRESENT, @@ -155,12 +159,12 @@ struct corsair_void_drvdata { struct power_supply *battery; struct power_supply_desc battery_desc; - struct mutex battery_mutex; struct delayed_work delayed_status_work; struct delayed_work delayed_firmware_work; - struct work_struct battery_remove_work; - struct work_struct battery_add_work; + + unsigned long battery_work_flags; + struct work_struct battery_work; }; /* @@ -260,11 +264,9 @@ static void corsair_void_process_receiver(struct corsair_void_drvdata *drvdata, /* Inform power supply if battery values changed */ if (memcmp(&orig_battery_data, battery_data, sizeof(*battery_data))) { - scoped_guard(mutex, &drvdata->battery_mutex) { - if (drvdata->battery) { - power_supply_changed(drvdata->battery); - } - } + set_bit(CORSAIR_VOID_UPDATE_BATTERY, + &drvdata->battery_work_flags); + schedule_work(&drvdata->battery_work); } } @@ -536,29 +538,11 @@ static void corsair_void_firmware_work_handler(struct work_struct *work) } -static void corsair_void_battery_remove_work_handler(struct work_struct *work) -{ - struct corsair_void_drvdata *drvdata; - - drvdata = container_of(work, struct corsair_void_drvdata, - battery_remove_work); - scoped_guard(mutex, &drvdata->battery_mutex) { - if (drvdata->battery) { - power_supply_unregister(drvdata->battery); - drvdata->battery = NULL; - } - } -} - -static void corsair_void_battery_add_work_handler(struct work_struct *work) +static void corsair_void_add_battery(struct corsair_void_drvdata *drvdata) { - struct corsair_void_drvdata *drvdata; struct power_supply_config psy_cfg = {}; struct power_supply *new_supply; - drvdata = container_of(work, struct corsair_void_drvdata, - battery_add_work); - guard(mutex)(&drvdata->battery_mutex); if (drvdata->battery) return; @@ -583,16 +567,42 @@ static void corsair_void_battery_add_work_handler(struct work_struct *work) drvdata->battery = new_supply; } +static void corsair_void_battery_work_handler(struct work_struct *work) +{ + struct corsair_void_drvdata *drvdata = container_of(work, + struct corsair_void_drvdata, battery_work); + + bool add_battery = test_and_clear_bit(CORSAIR_VOID_ADD_BATTERY, + &drvdata->battery_work_flags); + bool remove_battery = test_and_clear_bit(CORSAIR_VOID_REMOVE_BATTERY, + &drvdata->battery_work_flags); + bool update_battery = test_and_clear_bit(CORSAIR_VOID_UPDATE_BATTERY, + &drvdata->battery_work_flags); + + if (add_battery && !remove_battery) { + corsair_void_add_battery(drvdata); + } else if (remove_battery && !add_battery && drvdata->battery) { + power_supply_unregister(drvdata->battery); + drvdata->battery = NULL; + } + + if (update_battery && drvdata->battery) + power_supply_changed(drvdata->battery); + +} + static void corsair_void_headset_connected(struct corsair_void_drvdata *drvdata) { - schedule_work(&drvdata->battery_add_work); + set_bit(CORSAIR_VOID_ADD_BATTERY, &drvdata->battery_work_flags); + schedule_work(&drvdata->battery_work); schedule_delayed_work(&drvdata->delayed_firmware_work, msecs_to_jiffies(100)); } static void corsair_void_headset_disconnected(struct corsair_void_drvdata *drvdata) { - schedule_work(&drvdata->battery_remove_work); + set_bit(CORSAIR_VOID_REMOVE_BATTERY, &drvdata->battery_work_flags); + schedule_work(&drvdata->battery_work); corsair_void_set_unknown_wireless_data(drvdata); corsair_void_set_unknown_batt(drvdata); @@ -678,13 +688,7 @@ static int corsair_void_probe(struct hid_device *hid_dev, drvdata->battery_desc.get_property = corsair_void_battery_get_property; drvdata->battery = NULL; - INIT_WORK(&drvdata->battery_remove_work, - corsair_void_battery_remove_work_handler); - INIT_WORK(&drvdata->battery_add_work, - corsair_void_battery_add_work_handler); - ret = devm_mutex_init(drvdata->dev, &drvdata->battery_mutex); - if (ret) - return ret; + INIT_WORK(&drvdata->battery_work, corsair_void_battery_work_handler); ret = sysfs_create_group(&hid_dev->dev.kobj, &corsair_void_attr_group); if (ret) @@ -721,8 +725,7 @@ static void corsair_void_remove(struct hid_device *hid_dev) struct corsair_void_drvdata *drvdata = hid_get_drvdata(hid_dev); hid_hw_stop(hid_dev); - cancel_work_sync(&drvdata->battery_remove_work); - cancel_work_sync(&drvdata->battery_add_work); + cancel_work_sync(&drvdata->battery_work); if (drvdata->battery) power_supply_unregister(drvdata->battery); diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 541d682af15aa..8433306148d57 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -3450,7 +3450,7 @@ static const char *keys[KEY_MAX + 1] = { [KEY_MACRO_RECORD_START] = "MacroRecordStart", [KEY_MACRO_RECORD_STOP] = "MacroRecordStop", [KEY_MARK_WAYPOINT] = "MarkWayPoint", [KEY_MEDIA_REPEAT] = "MediaRepeat", - [KEY_MEDIA_TOP_MENU] = "MediaTopMenu", [KEY_MESSENGER] = "Messanger", + [KEY_MEDIA_TOP_MENU] = "MediaTopMenu", [KEY_MESSENGER] = "Messenger", [KEY_NAV_CHART] = "NavChar", [KEY_NAV_INFO] = "NavInfo", [KEY_NEWS] = "News", [KEY_NEXT_ELEMENT] = "NextElement", [KEY_NEXT_FAVORITE] = "NextFavorite", [KEY_NOTIFICATION_CENTER] = "NotificationCenter", diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c index 0f292b5d3e26d..eb6fd2dc75d0a 100644 --- a/drivers/hid/hid-google-hammer.c +++ b/drivers/hid/hid-google-hammer.c @@ -268,11 +268,13 @@ static void cbas_ec_remove(struct platform_device *pdev) mutex_unlock(&cbas_ec_reglock); } +#ifdef CONFIG_ACPI static const struct acpi_device_id cbas_ec_acpi_ids[] = { { "GOOG000B", 0 }, { } }; MODULE_DEVICE_TABLE(acpi, cbas_ec_acpi_ids); +#endif #ifdef CONFIG_OF static const struct of_device_id cbas_ec_of_match[] = { diff --git a/drivers/hid/hid-nintendo.c b/drivers/hid/hid-nintendo.c index 11ac246176ae1..839d5bcd72b1e 100644 --- a/drivers/hid/hid-nintendo.c +++ b/drivers/hid/hid-nintendo.c @@ -457,13 +457,13 @@ static const struct joycon_ctlr_button_mapping snescon_button_mappings[] = { }; static const struct joycon_ctlr_button_mapping gencon_button_mappings[] = { - { BTN_A, JC_BTN_A, }, - { BTN_B, JC_BTN_B, }, - { BTN_C, JC_BTN_R, }, - { BTN_X, JC_BTN_X, }, /* MD/GEN 6B Only */ - { BTN_Y, JC_BTN_Y, }, /* MD/GEN 6B Only */ - { BTN_Z, JC_BTN_L, }, /* MD/GEN 6B Only */ - { BTN_SELECT, JC_BTN_ZR, }, + { BTN_WEST, JC_BTN_A, }, /* A */ + { BTN_SOUTH, JC_BTN_B, }, /* B */ + { BTN_EAST, JC_BTN_R, }, /* C */ + { BTN_TL, JC_BTN_X, }, /* X MD/GEN 6B Only */ + { BTN_NORTH, JC_BTN_Y, }, /* Y MD/GEN 6B Only */ + { BTN_TR, JC_BTN_L, }, /* Z MD/GEN 6B Only */ + { BTN_SELECT, JC_BTN_ZR, }, /* Mode */ { BTN_START, JC_BTN_PLUS, }, { BTN_MODE, JC_BTN_HOME, }, { BTN_Z, JC_BTN_CAP, }, diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index c9e65e9088b31..10460b7bde1a2 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -1327,11 +1327,11 @@ static void steam_remove(struct hid_device *hdev) return; } + hid_destroy_device(steam->client_hdev); cancel_delayed_work_sync(&steam->mode_switch); cancel_work_sync(&steam->work_connect); cancel_work_sync(&steam->rumble_work); cancel_work_sync(&steam->unregister_work); - hid_destroy_device(steam->client_hdev); steam->client_hdev = NULL; steam->client_opened = 0; if (steam->quirks & STEAM_QUIRK_WIRELESS) { diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 75544448c2393..d3912e3f2f13a 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -290,7 +290,7 @@ static int i2c_hid_get_report(struct i2c_hid *ihid, ihid->rawbuf, recv_len + sizeof(__le16)); if (error) { dev_err(&ihid->client->dev, - "failed to set a report to device: %d\n", error); + "failed to get a report from device: %d\n", error); return error; } diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c index cb04cd1d980bd..6550ad5bfbb53 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c +++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c @@ -832,9 +832,9 @@ static void hid_ishtp_cl_remove(struct ishtp_cl_device *cl_device) hid_ishtp_cl); dev_dbg(ishtp_device(cl_device), "%s\n", __func__); - hid_ishtp_cl_deinit(hid_ishtp_cl); ishtp_put_device(cl_device); ishtp_hid_remove(client_data); + hid_ishtp_cl_deinit(hid_ishtp_cl); hid_ishtp_cl = NULL; diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.c b/drivers/hid/intel-ish-hid/ishtp-hid.c index 00c6f0ebf3563..be2c62fc8251d 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid.c +++ b/drivers/hid/intel-ish-hid/ishtp-hid.c @@ -261,12 +261,14 @@ int ishtp_hid_probe(unsigned int cur_hid_dev, */ void ishtp_hid_remove(struct ishtp_cl_data *client_data) { + void *data; int i; for (i = 0; i < client_data->num_hid_devices; ++i) { if (client_data->hid_sensor_hubs[i]) { - kfree(client_data->hid_sensor_hubs[i]->driver_data); + data = client_data->hid_sensor_hubs[i]->driver_data; hid_destroy_device(client_data->hid_sensor_hubs[i]); + kfree(data); client_data->hid_sensor_hubs[i] = NULL; } } diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c index 4641e818dfa44..6b2c7620be2b1 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c +++ b/drivers/hid/intel-thc-hid/intel-quickspi/pci-quickspi.c @@ -909,6 +909,8 @@ static int quickspi_restore(struct device *device) thc_change_ltr_mode(qsdev->thc_hw, THC_LTR_MODE_ACTIVE); + qsdev->state = QUICKSPI_ENABLED; + return 0; } diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c index 7373238ceb18b..918050af73e55 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c +++ b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c @@ -107,7 +107,7 @@ static int quickspi_get_device_descriptor(struct quickspi_device *qsdev) return 0; } - dev_err_once(qsdev->dev, "Unexpected intput report type: %d\n", input_rep_type); + dev_err_once(qsdev->dev, "Unexpected input report type: %d\n", input_rep_type); return -EINVAL; } diff --git a/drivers/hwmon/ad7314.c b/drivers/hwmon/ad7314.c index 7802bbf5f9587..59424103f6348 100644 --- a/drivers/hwmon/ad7314.c +++ b/drivers/hwmon/ad7314.c @@ -22,11 +22,13 @@ */ #define AD7314_TEMP_MASK 0x7FE0 #define AD7314_TEMP_SHIFT 5 +#define AD7314_LEADING_ZEROS_MASK BIT(15) /* * ADT7301 and ADT7302 temperature masks */ #define ADT7301_TEMP_MASK 0x3FFF +#define ADT7301_LEADING_ZEROS_MASK (BIT(15) | BIT(14)) enum ad7314_variant { adt7301, @@ -65,12 +67,20 @@ static ssize_t ad7314_temperature_show(struct device *dev, return ret; switch (spi_get_device_id(chip->spi_dev)->driver_data) { case ad7314: + if (ret & AD7314_LEADING_ZEROS_MASK) { + /* Invalid read-out, leading zero part is missing */ + return -EIO; + } data = (ret & AD7314_TEMP_MASK) >> AD7314_TEMP_SHIFT; data = sign_extend32(data, 9); return sprintf(buf, "%d\n", 250 * data); case adt7301: case adt7302: + if (ret & ADT7301_LEADING_ZEROS_MASK) { + /* Invalid read-out, leading zero part is missing */ + return -EIO; + } /* * Documented as a 13 bit twos complement register * with a sign bit - which is a 14 bit 2's complement diff --git a/drivers/hwmon/ntc_thermistor.c b/drivers/hwmon/ntc_thermistor.c index b5352900463fb..0d29c8f97ba7c 100644 --- a/drivers/hwmon/ntc_thermistor.c +++ b/drivers/hwmon/ntc_thermistor.c @@ -181,40 +181,40 @@ static const struct ntc_compensation ncpXXwf104[] = { }; static const struct ntc_compensation ncpXXxh103[] = { - { .temp_c = -40, .ohm = 247565 }, - { .temp_c = -35, .ohm = 181742 }, - { .temp_c = -30, .ohm = 135128 }, - { .temp_c = -25, .ohm = 101678 }, - { .temp_c = -20, .ohm = 77373 }, - { .temp_c = -15, .ohm = 59504 }, - { .temp_c = -10, .ohm = 46222 }, - { .temp_c = -5, .ohm = 36244 }, - { .temp_c = 0, .ohm = 28674 }, - { .temp_c = 5, .ohm = 22878 }, - { .temp_c = 10, .ohm = 18399 }, - { .temp_c = 15, .ohm = 14910 }, - { .temp_c = 20, .ohm = 12169 }, + { .temp_c = -40, .ohm = 195652 }, + { .temp_c = -35, .ohm = 148171 }, + { .temp_c = -30, .ohm = 113347 }, + { .temp_c = -25, .ohm = 87559 }, + { .temp_c = -20, .ohm = 68237 }, + { .temp_c = -15, .ohm = 53650 }, + { .temp_c = -10, .ohm = 42506 }, + { .temp_c = -5, .ohm = 33892 }, + { .temp_c = 0, .ohm = 27219 }, + { .temp_c = 5, .ohm = 22021 }, + { .temp_c = 10, .ohm = 17926 }, + { .temp_c = 15, .ohm = 14674 }, + { .temp_c = 20, .ohm = 12081 }, { .temp_c = 25, .ohm = 10000 }, - { .temp_c = 30, .ohm = 8271 }, - { .temp_c = 35, .ohm = 6883 }, - { .temp_c = 40, .ohm = 5762 }, - { .temp_c = 45, .ohm = 4851 }, - { .temp_c = 50, .ohm = 4105 }, - { .temp_c = 55, .ohm = 3492 }, - { .temp_c = 60, .ohm = 2985 }, - { .temp_c = 65, .ohm = 2563 }, - { .temp_c = 70, .ohm = 2211 }, - { .temp_c = 75, .ohm = 1915 }, - { .temp_c = 80, .ohm = 1666 }, - { .temp_c = 85, .ohm = 1454 }, - { .temp_c = 90, .ohm = 1275 }, - { .temp_c = 95, .ohm = 1121 }, - { .temp_c = 100, .ohm = 990 }, - { .temp_c = 105, .ohm = 876 }, - { .temp_c = 110, .ohm = 779 }, - { .temp_c = 115, .ohm = 694 }, - { .temp_c = 120, .ohm = 620 }, - { .temp_c = 125, .ohm = 556 }, + { .temp_c = 30, .ohm = 8315 }, + { .temp_c = 35, .ohm = 6948 }, + { .temp_c = 40, .ohm = 5834 }, + { .temp_c = 45, .ohm = 4917 }, + { .temp_c = 50, .ohm = 4161 }, + { .temp_c = 55, .ohm = 3535 }, + { .temp_c = 60, .ohm = 3014 }, + { .temp_c = 65, .ohm = 2586 }, + { .temp_c = 70, .ohm = 2228 }, + { .temp_c = 75, .ohm = 1925 }, + { .temp_c = 80, .ohm = 1669 }, + { .temp_c = 85, .ohm = 1452 }, + { .temp_c = 90, .ohm = 1268 }, + { .temp_c = 95, .ohm = 1110 }, + { .temp_c = 100, .ohm = 974 }, + { .temp_c = 105, .ohm = 858 }, + { .temp_c = 110, .ohm = 758 }, + { .temp_c = 115, .ohm = 672 }, + { .temp_c = 120, .ohm = 596 }, + { .temp_c = 125, .ohm = 531 }, }; /* diff --git a/drivers/hwmon/peci/dimmtemp.c b/drivers/hwmon/peci/dimmtemp.c index d6762259dd69c..fbe82d9852e01 100644 --- a/drivers/hwmon/peci/dimmtemp.c +++ b/drivers/hwmon/peci/dimmtemp.c @@ -127,8 +127,6 @@ static int update_thresholds(struct peci_dimmtemp *priv, int dimm_no) return 0; ret = priv->gen_info->read_thresholds(priv, dimm_order, chan_rank, &data); - if (ret == -ENODATA) /* Use default or previous value */ - return 0; if (ret) return ret; @@ -509,11 +507,11 @@ read_thresholds_icx(struct peci_dimmtemp *priv, int dimm_order, int chan_rank, u ret = peci_ep_pci_local_read(priv->peci_dev, 0, 13, 0, 2, 0xd4, ®_val); if (ret || !(reg_val & BIT(31))) - return -ENODATA; /* Use default or previous value */ + return -ENODATA; ret = peci_ep_pci_local_read(priv->peci_dev, 0, 13, 0, 2, 0xd0, ®_val); if (ret) - return -ENODATA; /* Use default or previous value */ + return -ENODATA; /* * Device 26, Offset 224e0: IMC 0 channel 0 -> rank 0 @@ -546,11 +544,11 @@ read_thresholds_spr(struct peci_dimmtemp *priv, int dimm_order, int chan_rank, u ret = peci_ep_pci_local_read(priv->peci_dev, 0, 30, 0, 2, 0xd4, ®_val); if (ret || !(reg_val & BIT(31))) - return -ENODATA; /* Use default or previous value */ + return -ENODATA; ret = peci_ep_pci_local_read(priv->peci_dev, 0, 30, 0, 2, 0xd0, ®_val); if (ret) - return -ENODATA; /* Use default or previous value */ + return -ENODATA; /* * Device 26, Offset 219a8: IMC 0 channel 0 -> rank 0 diff --git a/drivers/hwmon/pmbus/pmbus.c b/drivers/hwmon/pmbus/pmbus.c index 77cf268e7d2d6..920cd5408141a 100644 --- a/drivers/hwmon/pmbus/pmbus.c +++ b/drivers/hwmon/pmbus/pmbus.c @@ -103,6 +103,8 @@ static int pmbus_identify(struct i2c_client *client, if (pmbus_check_byte_register(client, 0, PMBUS_PAGE)) { int page; + info->pages = PMBUS_PAGES; + for (page = 1; page < PMBUS_PAGES; page++) { if (pmbus_set_page(client, page, 0xff) < 0) break; diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index 1e3bd129a922d..7087197383c96 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -706,7 +706,7 @@ static int xgene_hwmon_probe(struct platform_device *pdev) goto out; } - if (!ctx->pcc_comm_addr) { + if (IS_ERR_OR_NULL(ctx->pcc_comm_addr)) { dev_err(&pdev->dev, "Failed to ioremap PCC comm region\n"); rc = -ENOMEM; diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index 66123d684ac9e..bf99d79a41920 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -105,23 +105,32 @@ struct msc_iter { /** * struct msc - MSC device representation - * @reg_base: register window base address + * @reg_base: register window base address for the entire MSU + * @msu_base: register window base address for this MSC * @thdev: intel_th_device pointer * @mbuf: MSU buffer, if assigned - * @mbuf_priv MSU buffer's private data, if @mbuf + * @mbuf_priv: MSU buffer's private data, if @mbuf + * @work: a work to stop the trace when the buffer is full * @win_list: list of windows in multiblock mode * @single_sgt: single mode buffer * @cur_win: current window + * @switch_on_unlock: window to switch to when it becomes available * @nr_pages: total number of pages allocated for this buffer * @single_sz: amount of data in single mode * @single_wrap: single mode wrap occurred * @base: buffer's base pointer * @base_addr: buffer's base address + * @orig_addr: MSC0 buffer's base address + * @orig_sz: MSC0 buffer's size * @user_count: number of users of the buffer * @mmap_count: number of mappings * @buf_mutex: mutex to serialize access to buffer-related bits + * @iter_list: list of open file descriptor iterators + * @stop_on_full: stop the trace if the current window is full * @enabled: MSC is enabled * @wrap: wrapping is enabled + * @do_irq: IRQ resource is available, handle interrupts + * @multi_is_broken: multiblock mode enabled (not disabled by PCI drvdata) * @mode: MSC operating mode * @burst_len: write burst length * @index: number of this MSC in the MSU diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index e9d8d28e055f3..e3def163d5cf7 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -334,6 +334,21 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa824), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Arrow Lake */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7724), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, + { + /* Panther Lake-H */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe324), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, + { + /* Panther Lake-P/U */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe424), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { /* Alder Lake CPU */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f), diff --git a/drivers/hwtracing/stm/heartbeat.c b/drivers/hwtracing/stm/heartbeat.c index e9496fe97baa1..495eb1dc8ac58 100644 --- a/drivers/hwtracing/stm/heartbeat.c +++ b/drivers/hwtracing/stm/heartbeat.c @@ -81,10 +81,8 @@ static int stm_heartbeat_init(void) stm_heartbeat[i].data.type = STM_USER; stm_heartbeat[i].data.link = stm_heartbeat_link; stm_heartbeat[i].data.unlink = stm_heartbeat_unlink; - hrtimer_init(&stm_heartbeat[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - stm_heartbeat[i].hrtimer.function = - stm_heartbeat_hrtimer_handler; + hrtimer_setup(&stm_heartbeat[i].hrtimer, stm_heartbeat_hrtimer_handler, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ret = stm_source_register_device(NULL, &stm_heartbeat[i].data); if (ret) diff --git a/drivers/i2c/busses/i2c-amd-asf-plat.c b/drivers/i2c/busses/i2c-amd-asf-plat.c index 7512614bf4b73..93ebec162c6dd 100644 --- a/drivers/i2c/busses/i2c-amd-asf-plat.c +++ b/drivers/i2c/busses/i2c-amd-asf-plat.c @@ -293,6 +293,7 @@ static irqreturn_t amd_asf_irq_handler(int irq, void *ptr) amd_asf_update_ioport_target(piix4_smba, ASF_SLV_INTR, SMBHSTSTS, true); } + iowrite32(irq, dev->eoi_base); return IRQ_HANDLED; } diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index ee0d25b498cb2..9e5d454d8318e 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1723,8 +1723,8 @@ static int i2c_imx_probe(struct platform_device *pdev) return -ENOMEM; spin_lock_init(&i2c_imx->slave_lock); - hrtimer_init(&i2c_imx->slave_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - i2c_imx->slave_timer.function = i2c_imx_slave_timeout; + hrtimer_setup(&i2c_imx->slave_timer, i2c_imx_slave_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); match = device_get_match_data(&pdev->dev); if (match) diff --git a/drivers/i2c/busses/i2c-ls2x.c b/drivers/i2c/busses/i2c-ls2x.c index 8821cac3897b6..b475dd27b7af9 100644 --- a/drivers/i2c/busses/i2c-ls2x.c +++ b/drivers/i2c/busses/i2c-ls2x.c @@ -10,6 +10,7 @@ * Rewritten for mainline by Binbin Zhou */ +#include #include #include #include @@ -26,7 +27,8 @@ #include /* I2C Registers */ -#define I2C_LS2X_PRER 0x0 /* Freq Division Register(16 bits) */ +#define I2C_LS2X_PRER_LO 0x0 /* Freq Division Low Byte Register */ +#define I2C_LS2X_PRER_HI 0x1 /* Freq Division High Byte Register */ #define I2C_LS2X_CTR 0x2 /* Control Register */ #define I2C_LS2X_TXR 0x3 /* Transport Data Register */ #define I2C_LS2X_RXR 0x3 /* Receive Data Register */ @@ -93,6 +95,7 @@ static irqreturn_t ls2x_i2c_isr(int this_irq, void *dev_id) */ static void ls2x_i2c_adjust_bus_speed(struct ls2x_i2c_priv *priv) { + u16 val; struct i2c_timings *t = &priv->i2c_t; struct device *dev = priv->adapter.dev.parent; u32 acpi_speed = i2c_acpi_find_bus_speed(dev); @@ -104,9 +107,14 @@ static void ls2x_i2c_adjust_bus_speed(struct ls2x_i2c_priv *priv) else t->bus_freq_hz = LS2X_I2C_FREQ_STD; - /* Calculate and set i2c frequency. */ - writew(LS2X_I2C_PCLK_FREQ / (5 * t->bus_freq_hz) - 1, - priv->base + I2C_LS2X_PRER); + /* + * According to the chip manual, we can only access the registers as bytes, + * otherwise the high bits will be truncated. + * So set the I2C frequency with a sequential writeb() instead of writew(). + */ + val = LS2X_I2C_PCLK_FREQ / (5 * t->bus_freq_hz) - 1; + writeb(FIELD_GET(GENMASK(7, 0), val), priv->base + I2C_LS2X_PRER_LO); + writeb(FIELD_GET(GENMASK(15, 8), val), priv->base + I2C_LS2X_PRER_HI); } static void ls2x_i2c_init(struct ls2x_i2c_priv *priv) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index 3ca08b8ef8af3..de713b5747fe5 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -2554,6 +2554,13 @@ static int npcm_i2c_probe_bus(struct platform_device *pdev) if (irq < 0) return irq; + /* + * Disable the interrupt to avoid the interrupt handler being triggered + * incorrectly by the asynchronous interrupt status since the machine + * might do a warm reset during the last smbus/i2c transfer session. + */ + npcm_i2c_int_enable(bus, false); + ret = devm_request_irq(bus->dev, irq, npcm_i2c_bus_irq, 0, dev_name(bus->dev), bus); if (ret) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 35a221e2c11c1..7ad1ad5c8c3f5 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -2506,7 +2506,7 @@ static int i2c_detect_address(struct i2c_client *temp_client, static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) { const unsigned short *address_list; - struct i2c_client temp_client; + struct i2c_client *temp_client; int i, err = 0; address_list = driver->address_list; @@ -2527,19 +2527,24 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) return 0; /* Set up a temporary client to help detect callback */ - memset(&temp_client, 0, sizeof(temp_client)); - temp_client.adapter = adapter; + temp_client = kzalloc(sizeof(*temp_client), GFP_KERNEL); + if (!temp_client) + return -ENOMEM; + + temp_client->adapter = adapter; for (i = 0; address_list[i] != I2C_CLIENT_END; i += 1) { dev_dbg(&adapter->dev, "found normal entry for adapter %d, addr 0x%02x\n", i2c_adapter_id(adapter), address_list[i]); - temp_client.addr = address_list[i]; - err = i2c_detect_address(&temp_client, driver); + temp_client->addr = address_list[i]; + err = i2c_detect_address(temp_client, driver); if (unlikely(err)) break; } + kfree(temp_client); + return err; } diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 118fe1d37c226..5687089e406a6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -56,7 +56,9 @@ #include #include #include +#include #include +#include #define INTEL_IDLE_VERSION "0.5.1" @@ -228,6 +230,15 @@ static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev, return 0; } +static void intel_idle_enter_dead(struct cpuidle_device *dev, int index) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + + mwait_play_dead(eax); +} + /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. @@ -1799,7 +1810,11 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) if (intel_idle_state_needs_timer_stop(state)) state->flags |= CPUIDLE_FLAG_TIMER_STOP; + if (cx->type > ACPI_STATE_C1 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle"); + state->enter = intel_idle; + state->enter_dead = intel_idle_enter_dead; state->enter_s2idle = intel_idle_s2idle; } } @@ -2149,6 +2164,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) !cpuidle_state_table[cstate].enter_s2idle) break; + if (!cpuidle_state_table[cstate].enter_dead) + cpuidle_state_table[cstate].enter_dead = intel_idle_enter_dead; + /* If marked as unusable, skip this state. */ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { pr_debug("state %s is disabled\n", diff --git a/drivers/iio/adc/ad7192.c b/drivers/iio/adc/ad7192.c index e96a5ae92375d..cfaf8f7e0a07d 100644 --- a/drivers/iio/adc/ad7192.c +++ b/drivers/iio/adc/ad7192.c @@ -1084,7 +1084,7 @@ static int ad7192_update_scan_mode(struct iio_dev *indio_dev, const unsigned lon conf &= ~AD7192_CONF_CHAN_MASK; for_each_set_bit(i, scan_mask, 8) - conf |= FIELD_PREP(AD7192_CONF_CHAN_MASK, i); + conf |= FIELD_PREP(AD7192_CONF_CHAN_MASK, BIT(i)); ret = ad_sd_write_reg(&st->sd, AD7192_REG_CONF, 3, conf); if (ret < 0) diff --git a/drivers/iio/adc/ad7606.c b/drivers/iio/adc/ad7606.c index d8e3c7a43678c..d39354afd5394 100644 --- a/drivers/iio/adc/ad7606.c +++ b/drivers/iio/adc/ad7606.c @@ -1047,7 +1047,7 @@ static int ad7606_read_avail(struct iio_dev *indio_dev, cs = &st->chan_scales[ch]; *vals = (int *)cs->scale_avail; - *length = cs->num_scales; + *length = cs->num_scales * 2; *type = IIO_VAL_INT_PLUS_MICRO; return IIO_AVAIL_LIST; diff --git a/drivers/iio/adc/at91-sama5d2_adc.c b/drivers/iio/adc/at91-sama5d2_adc.c index 8e5aaf15a9215..c3a1dea2aa82e 100644 --- a/drivers/iio/adc/at91-sama5d2_adc.c +++ b/drivers/iio/adc/at91-sama5d2_adc.c @@ -329,7 +329,7 @@ static const struct at91_adc_reg_layout sama7g5_layout = { #define AT91_HWFIFO_MAX_SIZE_STR "128" #define AT91_HWFIFO_MAX_SIZE 128 -#define AT91_SAMA5D2_CHAN_SINGLE(index, num, addr) \ +#define AT91_SAMA_CHAN_SINGLE(index, num, addr, rbits) \ { \ .type = IIO_VOLTAGE, \ .channel = num, \ @@ -337,7 +337,7 @@ static const struct at91_adc_reg_layout sama7g5_layout = { .scan_index = index, \ .scan_type = { \ .sign = 'u', \ - .realbits = 14, \ + .realbits = rbits, \ .storagebits = 16, \ }, \ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), \ @@ -350,7 +350,13 @@ static const struct at91_adc_reg_layout sama7g5_layout = { .indexed = 1, \ } -#define AT91_SAMA5D2_CHAN_DIFF(index, num, num2, addr) \ +#define AT91_SAMA5D2_CHAN_SINGLE(index, num, addr) \ + AT91_SAMA_CHAN_SINGLE(index, num, addr, 14) + +#define AT91_SAMA7G5_CHAN_SINGLE(index, num, addr) \ + AT91_SAMA_CHAN_SINGLE(index, num, addr, 16) + +#define AT91_SAMA_CHAN_DIFF(index, num, num2, addr, rbits) \ { \ .type = IIO_VOLTAGE, \ .differential = 1, \ @@ -360,7 +366,7 @@ static const struct at91_adc_reg_layout sama7g5_layout = { .scan_index = index, \ .scan_type = { \ .sign = 's', \ - .realbits = 14, \ + .realbits = rbits, \ .storagebits = 16, \ }, \ .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), \ @@ -373,6 +379,12 @@ static const struct at91_adc_reg_layout sama7g5_layout = { .indexed = 1, \ } +#define AT91_SAMA5D2_CHAN_DIFF(index, num, num2, addr) \ + AT91_SAMA_CHAN_DIFF(index, num, num2, addr, 14) + +#define AT91_SAMA7G5_CHAN_DIFF(index, num, num2, addr) \ + AT91_SAMA_CHAN_DIFF(index, num, num2, addr, 16) + #define AT91_SAMA5D2_CHAN_TOUCH(num, name, mod) \ { \ .type = IIO_POSITIONRELATIVE, \ @@ -666,30 +678,30 @@ static const struct iio_chan_spec at91_sama5d2_adc_channels[] = { }; static const struct iio_chan_spec at91_sama7g5_adc_channels[] = { - AT91_SAMA5D2_CHAN_SINGLE(0, 0, 0x60), - AT91_SAMA5D2_CHAN_SINGLE(1, 1, 0x64), - AT91_SAMA5D2_CHAN_SINGLE(2, 2, 0x68), - AT91_SAMA5D2_CHAN_SINGLE(3, 3, 0x6c), - AT91_SAMA5D2_CHAN_SINGLE(4, 4, 0x70), - AT91_SAMA5D2_CHAN_SINGLE(5, 5, 0x74), - AT91_SAMA5D2_CHAN_SINGLE(6, 6, 0x78), - AT91_SAMA5D2_CHAN_SINGLE(7, 7, 0x7c), - AT91_SAMA5D2_CHAN_SINGLE(8, 8, 0x80), - AT91_SAMA5D2_CHAN_SINGLE(9, 9, 0x84), - AT91_SAMA5D2_CHAN_SINGLE(10, 10, 0x88), - AT91_SAMA5D2_CHAN_SINGLE(11, 11, 0x8c), - AT91_SAMA5D2_CHAN_SINGLE(12, 12, 0x90), - AT91_SAMA5D2_CHAN_SINGLE(13, 13, 0x94), - AT91_SAMA5D2_CHAN_SINGLE(14, 14, 0x98), - AT91_SAMA5D2_CHAN_SINGLE(15, 15, 0x9c), - AT91_SAMA5D2_CHAN_DIFF(16, 0, 1, 0x60), - AT91_SAMA5D2_CHAN_DIFF(17, 2, 3, 0x68), - AT91_SAMA5D2_CHAN_DIFF(18, 4, 5, 0x70), - AT91_SAMA5D2_CHAN_DIFF(19, 6, 7, 0x78), - AT91_SAMA5D2_CHAN_DIFF(20, 8, 9, 0x80), - AT91_SAMA5D2_CHAN_DIFF(21, 10, 11, 0x88), - AT91_SAMA5D2_CHAN_DIFF(22, 12, 13, 0x90), - AT91_SAMA5D2_CHAN_DIFF(23, 14, 15, 0x98), + AT91_SAMA7G5_CHAN_SINGLE(0, 0, 0x60), + AT91_SAMA7G5_CHAN_SINGLE(1, 1, 0x64), + AT91_SAMA7G5_CHAN_SINGLE(2, 2, 0x68), + AT91_SAMA7G5_CHAN_SINGLE(3, 3, 0x6c), + AT91_SAMA7G5_CHAN_SINGLE(4, 4, 0x70), + AT91_SAMA7G5_CHAN_SINGLE(5, 5, 0x74), + AT91_SAMA7G5_CHAN_SINGLE(6, 6, 0x78), + AT91_SAMA7G5_CHAN_SINGLE(7, 7, 0x7c), + AT91_SAMA7G5_CHAN_SINGLE(8, 8, 0x80), + AT91_SAMA7G5_CHAN_SINGLE(9, 9, 0x84), + AT91_SAMA7G5_CHAN_SINGLE(10, 10, 0x88), + AT91_SAMA7G5_CHAN_SINGLE(11, 11, 0x8c), + AT91_SAMA7G5_CHAN_SINGLE(12, 12, 0x90), + AT91_SAMA7G5_CHAN_SINGLE(13, 13, 0x94), + AT91_SAMA7G5_CHAN_SINGLE(14, 14, 0x98), + AT91_SAMA7G5_CHAN_SINGLE(15, 15, 0x9c), + AT91_SAMA7G5_CHAN_DIFF(16, 0, 1, 0x60), + AT91_SAMA7G5_CHAN_DIFF(17, 2, 3, 0x68), + AT91_SAMA7G5_CHAN_DIFF(18, 4, 5, 0x70), + AT91_SAMA7G5_CHAN_DIFF(19, 6, 7, 0x78), + AT91_SAMA7G5_CHAN_DIFF(20, 8, 9, 0x80), + AT91_SAMA7G5_CHAN_DIFF(21, 10, 11, 0x88), + AT91_SAMA7G5_CHAN_DIFF(22, 12, 13, 0x90), + AT91_SAMA7G5_CHAN_DIFF(23, 14, 15, 0x98), IIO_CHAN_SOFT_TIMESTAMP(24), AT91_SAMA5D2_CHAN_TEMP(AT91_SAMA7G5_ADC_TEMP_CHANNEL, "temp", 0xdc), }; diff --git a/drivers/iio/adc/pac1921.c b/drivers/iio/adc/pac1921.c index 90f61c47b1c46..63f5182151565 100644 --- a/drivers/iio/adc/pac1921.c +++ b/drivers/iio/adc/pac1921.c @@ -1198,11 +1198,11 @@ static int pac1921_match_acpi_device(struct iio_dev *indio_dev) label = devm_kstrdup(dev, status->package.elements[0].string.pointer, GFP_KERNEL); + ACPI_FREE(status); if (!label) return -ENOMEM; indio_dev->label = label; - ACPI_FREE(status); return 0; } diff --git a/drivers/iio/adc/ti-tsc2046.c b/drivers/iio/adc/ti-tsc2046.c index 7dde5713973f9..49560059f4b7b 100644 --- a/drivers/iio/adc/ti-tsc2046.c +++ b/drivers/iio/adc/ti-tsc2046.c @@ -812,9 +812,7 @@ static int tsc2046_adc_probe(struct spi_device *spi) spin_lock_init(&priv->state_lock); priv->state = TSC2046_STATE_SHUTDOWN; - hrtimer_init(&priv->trig_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - priv->trig_timer.function = tsc2046_adc_timer; + hrtimer_setup(&priv->trig_timer, tsc2046_adc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); ret = devm_iio_trigger_register(dev, trig); if (ret) { diff --git a/drivers/iio/dac/ad3552r.c b/drivers/iio/dac/ad3552r.c index e7206af53af61..7944f5c1d264d 100644 --- a/drivers/iio/dac/ad3552r.c +++ b/drivers/iio/dac/ad3552r.c @@ -410,6 +410,12 @@ static int ad3552r_reset(struct ad3552r_desc *dac) return ret; } + /* Clear reset error flag, see ad3552r manual, rev B table 38. */ + ret = ad3552r_write_reg(dac, AD3552R_REG_ADDR_ERR_STATUS, + AD3552R_MASK_RESET_STATUS); + if (ret) + return ret; + return ad3552r_update_reg_field(dac, AD3552R_REG_ADDR_INTERFACE_CONFIG_A, AD3552R_MASK_ADDR_ASCENSION, diff --git a/drivers/iio/filter/admv8818.c b/drivers/iio/filter/admv8818.c index 848baa6e3bbf5..d85b7d3de8660 100644 --- a/drivers/iio/filter/admv8818.c +++ b/drivers/iio/filter/admv8818.c @@ -574,21 +574,15 @@ static int admv8818_init(struct admv8818_state *st) struct spi_device *spi = st->spi; unsigned int chip_id; - ret = regmap_update_bits(st->regmap, ADMV8818_REG_SPI_CONFIG_A, - ADMV8818_SOFTRESET_N_MSK | - ADMV8818_SOFTRESET_MSK, - FIELD_PREP(ADMV8818_SOFTRESET_N_MSK, 1) | - FIELD_PREP(ADMV8818_SOFTRESET_MSK, 1)); + ret = regmap_write(st->regmap, ADMV8818_REG_SPI_CONFIG_A, + ADMV8818_SOFTRESET_N_MSK | ADMV8818_SOFTRESET_MSK); if (ret) { dev_err(&spi->dev, "ADMV8818 Soft Reset failed.\n"); return ret; } - ret = regmap_update_bits(st->regmap, ADMV8818_REG_SPI_CONFIG_A, - ADMV8818_SDOACTIVE_N_MSK | - ADMV8818_SDOACTIVE_MSK, - FIELD_PREP(ADMV8818_SDOACTIVE_N_MSK, 1) | - FIELD_PREP(ADMV8818_SDOACTIVE_MSK, 1)); + ret = regmap_write(st->regmap, ADMV8818_REG_SPI_CONFIG_A, + ADMV8818_SDOACTIVE_N_MSK | ADMV8818_SDOACTIVE_MSK); if (ret) { dev_err(&spi->dev, "ADMV8818 SDO Enable failed.\n"); return ret; diff --git a/drivers/iio/light/apds9306.c b/drivers/iio/light/apds9306.c index 69a0d609cffc9..5ed7e17f49e76 100644 --- a/drivers/iio/light/apds9306.c +++ b/drivers/iio/light/apds9306.c @@ -108,11 +108,11 @@ static const struct part_id_gts_multiplier apds9306_gts_mul[] = { { .part_id = 0xB1, .max_scale_int = 16, - .max_scale_nano = 3264320, + .max_scale_nano = 326432000, }, { .part_id = 0xB3, .max_scale_int = 14, - .max_scale_nano = 9712000, + .max_scale_nano = 97120000, }, }; diff --git a/drivers/iio/light/hid-sensor-prox.c b/drivers/iio/light/hid-sensor-prox.c index 7ab64f5c623c1..76b76d12b3882 100644 --- a/drivers/iio/light/hid-sensor-prox.c +++ b/drivers/iio/light/hid-sensor-prox.c @@ -49,9 +49,10 @@ static const u32 prox_sensitivity_addresses[] = { #define PROX_CHANNEL(_is_proximity, _channel) \ {\ .type = _is_proximity ? IIO_PROXIMITY : IIO_ATTENTION,\ - .info_mask_separate = _is_proximity ? BIT(IIO_CHAN_INFO_RAW) :\ - BIT(IIO_CHAN_INFO_PROCESSED),\ - .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) |\ + .info_mask_separate = \ + (_is_proximity ? BIT(IIO_CHAN_INFO_RAW) :\ + BIT(IIO_CHAN_INFO_PROCESSED)) |\ + BIT(IIO_CHAN_INFO_OFFSET) |\ BIT(IIO_CHAN_INFO_SCALE) |\ BIT(IIO_CHAN_INFO_SAMP_FREQ) |\ BIT(IIO_CHAN_INFO_HYSTERESIS),\ diff --git a/drivers/iio/proximity/hx9023s.c b/drivers/iio/proximity/hx9023s.c index e092a935dbac7..5aa8e5a22f326 100644 --- a/drivers/iio/proximity/hx9023s.c +++ b/drivers/iio/proximity/hx9023s.c @@ -1036,12 +1036,13 @@ static int hx9023s_send_cfg(const struct firmware *fw, struct hx9023s_data *data return -ENOMEM; memcpy(bin->data, fw->data, fw->size); - release_firmware(fw); bin->fw_size = fw->size; bin->fw_ver = bin->data[FW_VER_OFFSET]; bin->reg_count = get_unaligned_le16(bin->data + FW_REG_CNT_OFFSET); + release_firmware(fw); + return hx9023s_bin_load(data, bin); } diff --git a/drivers/iio/trigger/iio-trig-hrtimer.c b/drivers/iio/trigger/iio-trig-hrtimer.c index 716c795d08fb9..82c72baccb623 100644 --- a/drivers/iio/trigger/iio-trig-hrtimer.c +++ b/drivers/iio/trigger/iio-trig-hrtimer.c @@ -145,8 +145,8 @@ static struct iio_sw_trigger *iio_trig_hrtimer_probe(const char *name) trig_info->swt.trigger->ops = &iio_hrtimer_trigger_ops; trig_info->swt.trigger->dev.groups = iio_hrtimer_attr_groups; - hrtimer_init(&trig_info->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - trig_info->timer.function = iio_hrtimer_trig_handler; + hrtimer_setup(&trig_info->timer, iio_hrtimer_trig_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); trig_info->sampling_frequency[0] = HRTIMER_DEFAULT_SAMPLING_FREQUENCY; trig_info->period = NSEC_PER_SEC / trig_info->sampling_frequency[0]; diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b91a85a491d05..3721446c6ba4b 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -187,7 +187,6 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; struct auxiliary_device *adev; - struct notifier_block nb; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 3ac47f4e61229..f039aefcaf675 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -348,8 +348,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } bnxt_re_copy_err_stats(rdev, stats, err_s); - if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags) && - !rdev->is_virtfn) { + if (bnxt_ext_stats_supported(rdev->chip_ctx, rdev->dev_attr->dev_cap_flags, + rdev->is_virtfn)) { rc = bnxt_re_get_ext_stat(rdev, stats); if (rc) { clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2de101d6e8255..6f5db32082dd7 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1870,6 +1870,8 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; srq->srq_limit = srq_init_attr->attr.srq_limit; srq->qplib_srq.eventq_hw_ring_id = rdev->nqr->nq[0].ring_id; + srq->qplib_srq.sg_info.pgsize = PAGE_SIZE; + srq->qplib_srq.sg_info.pgshft = PAGE_SHIFT; nq = &rdev->nqr->nq[0]; if (udata) { diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e9e4da4dd576b..a94c8c5387d9e 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -396,11 +396,16 @@ static void bnxt_re_dcb_wq_task(struct work_struct *work) static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); struct bnxt_re_dcb_work *dcb_work; + struct bnxt_re_dev *rdev; u32 data1, data2; u16 event_id; + rdev = en_info->rdev; + if (!rdev) + return; + event_id = le16_to_cpu(cmpl->event_id); data1 = le32_to_cpu(cmpl->event_data1); data2 = le32_to_cpu(cmpl->event_data2); @@ -433,6 +438,8 @@ static void bnxt_re_stop_irq(void *handle, bool reset) int indx; rdev = en_info->rdev; + if (!rdev) + return; rcfw = &rdev->rcfw; if (reset) { @@ -461,6 +468,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) int indx, rc; rdev = en_info->rdev; + if (!rdev) + return; msix_ent = rdev->nqr->msix_entries; rcfw = &rdev->rcfw; if (!ent) { @@ -1350,7 +1359,6 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, return NULL; } /* Default values */ - rdev->nb.notifier_call = NULL; rdev->netdev = en_dev->net; rdev->en_dev = en_dev; rdev->adev = adev; @@ -2345,15 +2353,6 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, struct auxiliary_device *aux_dev) { - if (rdev->nb.notifier_call) { - unregister_netdevice_notifier(&rdev->nb); - rdev->nb.notifier_call = NULL; - } else { - /* If notifier is null, we should have already done a - * clean up before coming here. - */ - return; - } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); bnxt_re_dev_uninit(rdev, op_type); @@ -2433,6 +2432,7 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx", __func__, en_dev->en_state); bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev); + bnxt_re_update_en_info_rdev(NULL, en_info, adev); mutex_unlock(&bnxt_re_mutex); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index be5d907a036b6..711990232de1c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -547,6 +547,14 @@ static inline bool _is_ext_stats_supported(u16 dev_cap_flags) CREQ_QUERY_FUNC_RESP_SB_EXT_STATS; } +static inline int bnxt_ext_stats_supported(struct bnxt_qplib_chip_ctx *ctx, + u16 flags, bool virtfn) +{ + /* ext stats supported if cap flag is set AND is a PF OR a Thor2 VF */ + return (_is_ext_stats_supported(flags) && + ((virtfn && bnxt_qplib_is_chip_gen_p7(ctx)) || (!virtfn))); +} + static inline bool _is_hw_retx_supported(u16 dev_cap_flags) { return dev_cap_flags & diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index cbac4a442d9ec..d6fbd9c2b8b49 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -635,12 +635,11 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cca_timer_lock); for (i = 0; i < OPA_MAX_SLS; i++) { - hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); ppd->cca_timer[i].ppd = ppd; ppd->cca_timer[i].sl = i; ppd->cca_timer[i].ccti = 0; - ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + hrtimer_setup(&ppd->cca_timer[i].hrtimer, cca_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dded339802b33..160e8927d364e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1286,10 +1286,8 @@ static u32 hns_roce_cmdq_tx_timeout(u16 opcode, u32 tx_timeout) return tx_timeout; } -static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode) +static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u32 tx_timeout) { - struct hns_roce_v2_priv *priv = hr_dev->priv; - u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); u32 timeout = 0; do { @@ -1299,8 +1297,9 @@ static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode) } while (++timeout < tx_timeout); } -static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, - struct hns_roce_cmq_desc *desc, int num) +static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, + int num, u32 tx_timeout) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; @@ -1309,8 +1308,6 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, int ret; int i; - spin_lock_bh(&csq->lock); - tail = csq->head; for (i = 0; i < num; i++) { @@ -1324,22 +1321,17 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_CNT]); - hns_roce_wait_csq_done(hr_dev, le16_to_cpu(desc->opcode)); + hns_roce_wait_csq_done(hr_dev, tx_timeout); if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { /* check the result of hardware write back */ - desc[i] = csq->desc[tail++]; + desc_ret = le16_to_cpu(csq->desc[tail++].retval); if (tail == csq->desc_num) tail = 0; - - desc_ret = le16_to_cpu(desc[i].retval); if (likely(desc_ret == CMD_EXEC_SUCCESS)) continue; - dev_err_ratelimited(hr_dev->dev, - "Cmdq IO error, opcode = 0x%x, return = 0x%x.\n", - desc->opcode, desc_ret); ret = hns_roce_cmd_err_convert_errno(desc_ret); } } else { @@ -1354,14 +1346,54 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, ret = -EAGAIN; } - spin_unlock_bh(&csq->lock); - if (ret) atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_ERR_CNT]); return ret; } +static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, int num) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + u16 opcode = le16_to_cpu(desc->opcode); + u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); + u8 try_cnt = HNS_ROCE_OPC_POST_MB_TRY_CNT; + u32 rsv_tail; + int ret; + int i; + + while (try_cnt) { + try_cnt--; + + spin_lock_bh(&csq->lock); + rsv_tail = csq->head; + ret = __hns_roce_cmq_send_one(hr_dev, desc, num, tx_timeout); + if (opcode == HNS_ROCE_OPC_POST_MB && ret == -ETIME && + try_cnt) { + spin_unlock_bh(&csq->lock); + mdelay(HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC); + continue; + } + + for (i = 0; i < num; i++) { + desc[i] = csq->desc[rsv_tail++]; + if (rsv_tail == csq->desc_num) + rsv_tail = 0; + } + spin_unlock_bh(&csq->lock); + break; + } + + if (ret) + dev_err_ratelimited(hr_dev->dev, + "Cmdq IO error, opcode = 0x%x, return = %d.\n", + opcode, ret); + + return ret; +} + static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index cbdbc9edbce6e..91a5665465ffb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -230,6 +230,8 @@ enum hns_roce_opcode_type { }; #define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 +#define HNS_ROCE_OPC_POST_MB_TRY_CNT 8 +#define HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC 5 struct hns_roce_cmdq_tx_timeout_map { u16 opcode; u32 tx_timeout; diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 67c2d43135a8a..457cea6d99095 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -174,7 +174,7 @@ static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; req.num_resources = 1; - req.alignment = 1; + req.alignment = PAGE_SIZE / MANA_PAGE_SIZE; /* Have GDMA start searching from 0 */ req.allocated_resources = 0; diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 505bc47fd575d..99036afb3aef0 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -67,7 +67,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.tclass = grh->traffic_class; } - ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + ah->av.stat_rate_sl = + (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (init_attr->xmit_slave) diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 4f6c1968a2ee3..81cfa74147a18 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -546,6 +546,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); + bool new = false; int err; if (!counter->id) { @@ -560,6 +561,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return err; counter->id = MLX5_GET(alloc_q_counter_out, out, counter_set_id); + new = true; } err = mlx5_ib_qp_set_counter(qp, counter); @@ -569,8 +571,10 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return 0; fail_set_counter: - mlx5_ib_counter_dealloc(counter); - counter->id = 0; + if (new) { + mlx5_ib_counter_dealloc(counter); + counter->id = 0; + } return err; } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bb02b6adbf2c2..753faa9ad06a8 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1550,7 +1550,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); - if (!umem_dmabuf->sgt) + if (!umem_dmabuf->sgt || !mr) return; mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); @@ -1935,7 +1935,8 @@ mlx5_alloc_priv_descs(struct ib_device *device, static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && !mr->data_direct && mr->descs) { + if (!mr->umem && !mr->data_direct && + mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -2022,11 +2023,16 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; bool is_odp = is_odp_mr(mr); + bool is_odp_dma_buf = is_dmabuf_mr(mr) && + !to_ib_umem_dmabuf(mr->umem)->pinned; int ret = 0; if (is_odp) mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); + if (is_odp_dma_buf) + dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL); + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { ent = mr->mmkey.cache_ent; /* upon storing to a clean temp entry - schedule its cleanup */ @@ -2054,6 +2060,12 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); } + if (is_odp_dma_buf) { + if (!ret) + to_ib_umem_dmabuf(mr->umem)->private = NULL; + dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); + } + return ret; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f1e23583e6c08..e77c9280c07e4 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -242,6 +242,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) != mr) { xa_unlock(&imr->implicit_children); + mlx5r_deref_odp_mkey(&imr->mmkey); return; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a43eba9d3572c..88724d15705d4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3447,11 +3447,11 @@ static int ib_to_mlx5_rate_map(u8 rate) return 0; } -static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) { u32 stat_rate_support; - if (rate == IB_RATE_PORT_CURRENT) + if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) return 0; if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS) @@ -3596,7 +3596,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, sizeof(grh->dgid.raw)); } - err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah)); if (err < 0) return err; MLX5_SET(ads, path, stat_rate, err); @@ -4579,6 +4579,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); MLX5_SET(dctc, dctc, counter_set_id, set_id); + + qp->port = attr->port_num; } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; @@ -5074,7 +5076,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, } if (qp_attr_mask & IB_QP_PORT) - qp_attr->port_num = MLX5_GET(dctc, dctc, port); + qp_attr->port_num = mqp->port; if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); if (qp_attr_mask & IB_QP_AV) { diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index b6ee7c3ee1ca1..2530e7730635f 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -56,4 +56,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); int mlx5_ib_qp_event_init(void); void mlx5_ib_qp_event_cleanup(void); +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate); #endif /* _MLX5_IB_QP_H */ diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 887fd6fa3ba93..793f3c5c4d012 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -231,30 +231,6 @@ void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) ib_dealloc_pd(dev->umrc.pd); } -static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) -{ - struct umr_common *umrc = &dev->umrc; - struct ib_qp_attr attr; - int err; - - attr.qp_state = IB_QPS_RESET; - err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); - if (err) { - mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); - goto err; - } - - err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); - if (err) - goto err; - - umrc->state = MLX5_UMR_STATE_ACTIVE; - return 0; - -err: - umrc->state = MLX5_UMR_STATE_ERR; - return err; -} static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) @@ -302,6 +278,61 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, return err; } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey, + struct mlx5r_umr_context *umr_context, + struct mlx5r_umr_wqe *wqe, bool with_data) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + mutex_lock(&umrc->lock); + /* Preventing any further WRs to be sent now */ + if (umrc->state != MLX5_UMR_STATE_RECOVER) { + mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n", + umrc->state); + umrc->state = MLX5_UMR_STATE_RECOVER; + } + mutex_unlock(&umrc->lock); + + /* Sending a final/barrier WR (the failed one) and wait for its completion. + * This will ensure that all the previous WRs got a completion before + * we set the QP state to RESET. + */ + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe, + with_data); + if (err) { + mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); + goto err; + } + + /* Since the QP is in an error state, it will only receive + * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier + * we don't care about its status. + */ + wait_for_completion(&umr_context->done); + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); + goto err; + } + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) { struct mlx5_ib_umr_context *context = @@ -366,9 +397,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5_ib_warn(dev, "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", umr_context.status, mkey); - mutex_lock(&umrc->lock); - err = mlx5r_umr_recover(dev); - mutex_unlock(&umrc->lock); + err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); if (err) mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", err); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index e6203e26cc064..614009fb96329 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1107,9 +1107,8 @@ int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, } /* initialize timers needed for rc qp */ timer_setup(&qp->s_timer, rvt_rc_timeout, 0); - hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - qp->s_rnr_timer.function = rvt_rc_rnr_retry; + hrtimer_setup(&qp->s_rnr_timer, rvt_rc_rnr_retry, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Driver needs to set up it's private QP structure and do any diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b48a72bd7b23d..cd5116d8c3b28 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2043,12 +2043,12 @@ static void set_dte_entry(struct amd_iommu *iommu, make_clear_dte(dev_data, dte, &new); if (domain->iop.mode != PAGE_MODE_NONE) - new.data[0] = iommu_virt_to_phys(domain->iop.root); + new.data[0] |= iommu_virt_to_phys(domain->iop.root); new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; - new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V; + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; /* * When SNP is enabled, we can only support TV=1 with non-zero domain ID. diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 9f424acf474e9..e540092d664d2 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -2043,6 +2043,7 @@ int enable_drhd_fault_handling(unsigned int cpu) /* * Enable fault control interrupt. */ + guard(rwsem_read)(&dmar_global_lock); for_each_iommu(iommu, drhd) { u32 fault_status; int ret; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index cc46098f875b1..bf1f0c8143483 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3146,7 +3146,14 @@ int __init intel_iommu_init(void) iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); + /* + * The iommu device probe is protected by the iommu_probe_device_lock. + * Release the dmar_global_lock before entering the device probe path + * to avoid unnecessary lock order splat. + */ + up_read(&dmar_global_lock); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + down_read(&dmar_global_lock); iommu_pmu_register(iommu); } @@ -4378,9 +4385,6 @@ static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void * { struct device *dev = data; - if (dev != &pdev->dev) - return 0; - return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); } diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index c11b9965c4ad9..083fa9578bb7e 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -590,13 +590,7 @@ config RISCV_IMSIC select IRQ_DOMAIN_HIERARCHY select GENERIC_IRQ_MATRIX_ALLOCATOR select GENERIC_MSI_IRQ - -config RISCV_IMSIC_PCI - bool - depends on RISCV_IMSIC - depends on PCI - depends on PCI_MSI - default RISCV_IMSIC + select IRQ_MSI_LIB config SIFIVE_PLIC bool @@ -752,6 +746,18 @@ config MCHP_EIC help Support for Microchip External Interrupt Controller. +config SOPHGO_SG2042_MSI + bool "Sophgo SG2042 MSI Controller" + depends on ARCH_SOPHGO || COMPILE_TEST + depends on PCI + select IRQ_DOMAIN_HIERARCHY + select IRQ_MSI_LIB + select PCI_MSI + help + Support for the Sophgo SG2042 MSI Controller. + This on-chip interrupt controller enables MSI sources to be + routed to the primary PLIC controller on SoC. + config SUNPLUS_SP7021_INTC bool "Sunplus SP7021 interrupt controller" if COMPILE_TEST default SOC_SP7021 diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 25e9ad29b8c4a..dd60e597491d9 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -128,4 +128,5 @@ obj-$(CONFIG_WPCM450_AIC) += irq-wpcm450-aic.o obj-$(CONFIG_IRQ_IDT3243X) += irq-idt3243x.o obj-$(CONFIG_APPLE_AIC) += irq-apple-aic.o obj-$(CONFIG_MCHP_EIC) += irq-mchp-eic.o +obj-$(CONFIG_SOPHGO_SG2042_MSI) += irq-sg2042-msi.o obj-$(CONFIG_SUNPLUS_SP7021_INTC) += irq-sp7021-intc.o diff --git a/drivers/irqchip/irq-davinci-cp-intc.c b/drivers/irqchip/irq-davinci-cp-intc.c index f4f8e9fadbbfb..d7948c55f5422 100644 --- a/drivers/irqchip/irq-davinci-cp-intc.c +++ b/drivers/irqchip/irq-davinci-cp-intc.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -154,24 +153,20 @@ static const struct irq_domain_ops davinci_cp_intc_irq_domain_ops = { .xlate = irq_domain_xlate_onetwocell, }; -static int __init -davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, - struct device_node *node) +static int __init davinci_cp_intc_do_init(struct resource *res, unsigned int num_irqs, + struct device_node *node) { - unsigned int num_regs = BITS_TO_LONGS(config->num_irqs); + unsigned int num_regs = BITS_TO_LONGS(num_irqs); int offset, irq_base; void __iomem *req; - req = request_mem_region(config->reg.start, - resource_size(&config->reg), - "davinci-cp-intc"); + req = request_mem_region(res->start, resource_size(res), "davinci-cp-intc"); if (!req) { pr_err("%s: register range busy\n", __func__); return -EBUSY; } - davinci_cp_intc_base = ioremap(config->reg.start, - resource_size(&config->reg)); + davinci_cp_intc_base = ioremap(res->start, resource_size(res)); if (!davinci_cp_intc_base) { pr_err("%s: unable to ioremap register range\n", __func__); return -EINVAL; @@ -184,8 +179,7 @@ davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, /* Disable system interrupts */ for (offset = 0; offset < num_regs; offset++) - davinci_cp_intc_write(~0, - DAVINCI_CP_INTC_SYS_ENABLE_CLR(offset)); + davinci_cp_intc_write(~0, DAVINCI_CP_INTC_SYS_ENABLE_CLR(offset)); /* Set to normal mode, no nesting, no priority hold */ davinci_cp_intc_write(0, DAVINCI_CP_INTC_CTRL); @@ -193,28 +187,25 @@ davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, /* Clear system interrupt status */ for (offset = 0; offset < num_regs; offset++) - davinci_cp_intc_write(~0, - DAVINCI_CP_INTC_SYS_STAT_CLR(offset)); + davinci_cp_intc_write(~0, DAVINCI_CP_INTC_SYS_STAT_CLR(offset)); /* Enable nIRQ (what about nFIQ?) */ davinci_cp_intc_write(1, DAVINCI_CP_INTC_HOST_ENABLE_IDX_SET); + /* 4 channels per register */ + num_regs = (num_irqs + 3) >> 2; /* Default all priorities to channel 7. */ - num_regs = (config->num_irqs + 3) >> 2; /* 4 channels per register */ for (offset = 0; offset < num_regs; offset++) - davinci_cp_intc_write(0x07070707, - DAVINCI_CP_INTC_CHAN_MAP(offset)); + davinci_cp_intc_write(0x07070707, DAVINCI_CP_INTC_CHAN_MAP(offset)); - irq_base = irq_alloc_descs(-1, 0, config->num_irqs, 0); + irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); if (irq_base < 0) { - pr_err("%s: unable to allocate interrupt descriptors: %d\n", - __func__, irq_base); + pr_err("%s: unable to allocate interrupt descriptors: %d\n", __func__, irq_base); return irq_base; } - davinci_cp_intc_irq_domain = irq_domain_add_legacy( - node, config->num_irqs, irq_base, 0, - &davinci_cp_intc_irq_domain_ops, NULL); + davinci_cp_intc_irq_domain = irq_domain_add_legacy(node, num_irqs, irq_base, 0, + &davinci_cp_intc_irq_domain_ops, NULL); if (!davinci_cp_intc_irq_domain) { pr_err("%s: unable to create an interrupt domain\n", __func__); @@ -229,31 +220,25 @@ davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, return 0; } -int __init davinci_cp_intc_init(const struct davinci_cp_intc_config *config) -{ - return davinci_cp_intc_do_init(config, NULL); -} - static int __init davinci_cp_intc_of_init(struct device_node *node, struct device_node *parent) { - struct davinci_cp_intc_config config = { }; + unsigned int num_irqs; + struct resource res; int ret; - ret = of_address_to_resource(node, 0, &config.reg); + ret = of_address_to_resource(node, 0, &res); if (ret) { - pr_err("%s: unable to get the register range from device-tree\n", - __func__); + pr_err("%s: unable to get the register range from device-tree\n", __func__); return ret; } - ret = of_property_read_u32(node, "ti,intc-size", &config.num_irqs); + ret = of_property_read_u32(node, "ti,intc-size", &num_irqs); if (ret) { - pr_err("%s: unable to read the 'ti,intc-size' property\n", - __func__); + pr_err("%s: unable to read the 'ti,intc-size' property\n", __func__); return ret; } - return davinci_cp_intc_do_init(&config, node); + return davinci_cp_intc_do_init(&res, num_irqs, node); } IRQCHIP_DECLARE(cp_intc, "ti,cp-intc", davinci_cp_intc_of_init); diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index be35c5349986a..1e3476c335ca8 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -255,6 +255,7 @@ static void __init gicv2m_teardown(void) static struct msi_parent_ops gicv2m_msi_parent_ops = { .supported_flags = GICV2M_MSI_FLAGS_SUPPORTED, .required_flags = GICV2M_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "GICv2m-", diff --git a/drivers/irqchip/irq-gic-v3-its-msi-parent.c b/drivers/irqchip/irq-gic-v3-its-msi-parent.c index e150365fbe892..bdb04c8081480 100644 --- a/drivers/irqchip/irq-gic-v3-its-msi-parent.c +++ b/drivers/irqchip/irq-gic-v3-its-msi-parent.c @@ -203,6 +203,7 @@ static bool its_init_dev_msi_info(struct device *dev, struct irq_domain *domain, const struct msi_parent_ops gic_v3_its_msi_parent_ops = { .supported_flags = ITS_MSI_FLAGS_SUPPORTED, .required_flags = ITS_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "ITS-", diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 8c3ec5734f1ef..f30ed281882ff 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -205,13 +205,15 @@ static DEFINE_IDA(its_vpeid_ida); #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) +static gfp_t gfp_flags_quirk; + static struct page *its_alloc_pages_node(int node, gfp_t gfp, unsigned int order) { struct page *page; int ret = 0; - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_node(node, gfp | gfp_flags_quirk, order); if (!page) return NULL; @@ -4887,6 +4889,17 @@ static bool __maybe_unused its_enable_quirk_hip09_162100801(void *data) return true; } +static bool __maybe_unused its_enable_rk3568002(void *data) +{ + if (!of_machine_is_compatible("rockchip,rk3566") && + !of_machine_is_compatible("rockchip,rk3568")) + return false; + + gfp_flags_quirk |= GFP_DMA32; + + return true; +} + static const struct gic_quirk its_quirks[] = { #ifdef CONFIG_CAVIUM_ERRATUM_22375 { @@ -4954,6 +4967,14 @@ static const struct gic_quirk its_quirks[] = { .property = "dma-noncoherent", .init = its_set_non_coherent, }, +#ifdef CONFIG_ROCKCHIP_ERRATUM_3568002 + { + .desc = "ITS: Rockchip erratum RK3568002", + .iidr = 0x0201743b, + .mask = 0xffffffff, + .init = its_enable_rk3568002, + }, +#endif { } }; diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee174..3e1d8a1cda5e6 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -201,6 +201,7 @@ static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, static const struct msi_parent_ops gic_v3_mbi_msi_parent_ops = { .supported_flags = MBI_MSI_FLAGS_SUPPORTED, .required_flags = MBI_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "MBI-", diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 76dce0aac2465..270d7a4d85a6d 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -44,6 +44,7 @@ static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI; #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0) #define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539 (1ULL << 1) #define FLAGS_WORKAROUND_ASR_ERRATUM_8601001 (1ULL << 2) +#define FLAGS_WORKAROUND_INSECURE (1ULL << 3) #define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1) @@ -83,6 +84,8 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); #define GIC_LINE_NR min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U) #define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer) +static bool nmi_support_forbidden; + /* * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs * are potentially stolen by the secure side. Some code, especially code dealing @@ -163,21 +166,27 @@ static void __init gic_prio_init(void) { bool ds; - ds = gic_dist_security_disabled(); - if (!ds) { - u32 val; - - val = readl_relaxed(gic_data.dist_base + GICD_CTLR); - val |= GICD_CTLR_DS; - writel_relaxed(val, gic_data.dist_base + GICD_CTLR); + cpus_have_group0 = gic_has_group0(); - ds = gic_dist_security_disabled(); - if (ds) - pr_warn("Broken GIC integration, security disabled"); + ds = gic_dist_security_disabled(); + if ((gic_data.flags & FLAGS_WORKAROUND_INSECURE) && !ds) { + if (cpus_have_group0) { + u32 val; + + val = readl_relaxed(gic_data.dist_base + GICD_CTLR); + val |= GICD_CTLR_DS; + writel_relaxed(val, gic_data.dist_base + GICD_CTLR); + + ds = gic_dist_security_disabled(); + if (ds) + pr_warn("Broken GIC integration, security disabled\n"); + } else { + pr_warn("Broken GIC integration, pNMI forbidden\n"); + nmi_support_forbidden = true; + } } cpus_have_security_disabled = ds; - cpus_have_group0 = gic_has_group0(); /* * How priority values are used by the GIC depends on two things: @@ -209,7 +218,7 @@ static void __init gic_prio_init(void) * be in the non-secure range, we program the non-secure values into * the distributor to match the PMR values we want. */ - if (cpus_have_group0 & !cpus_have_security_disabled) { + if (cpus_have_group0 && !cpus_have_security_disabled) { dist_prio_irq = __gicv3_prio_to_ns(dist_prio_irq); dist_prio_nmi = __gicv3_prio_to_ns(dist_prio_nmi); } @@ -1922,6 +1931,18 @@ static bool gic_enable_quirk_arm64_2941627(void *data) return true; } +static bool gic_enable_quirk_rk3399(void *data) +{ + struct gic_chip_data *d = data; + + if (of_machine_is_compatible("rockchip,rk3399")) { + d->flags |= FLAGS_WORKAROUND_INSECURE; + return true; + } + + return false; +} + static bool rd_set_non_coherent(void *data) { struct gic_chip_data *d = data; @@ -1996,6 +2017,12 @@ static const struct gic_quirk gic_quirks[] = { .property = "dma-noncoherent", .init = rd_set_non_coherent, }, + { + .desc = "GICv3: Insecure RK3399 integration", + .iidr = 0x0000043b, + .mask = 0xff000fff, + .init = gic_enable_quirk_rk3399, + }, { } }; @@ -2004,7 +2031,7 @@ static void gic_enable_nmi_support(void) { int i; - if (!gic_prio_masking_enabled()) + if (!gic_prio_masking_enabled() || nmi_support_forbidden) return; rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR, diff --git a/drivers/irqchip/irq-imx-irqsteer.c b/drivers/irqchip/irq-imx-irqsteer.c index b0e9788c00454..afbfcce3b1e3b 100644 --- a/drivers/irqchip/irq-imx-irqsteer.c +++ b/drivers/irqchip/irq-imx-irqsteer.c @@ -24,7 +24,7 @@ #define CHAN_MINTDIS(t) (CTRL_STRIDE_OFF(t, 3) + 0x4) #define CHAN_MASTRSTAT(t) (CTRL_STRIDE_OFF(t, 3) + 0x8) -#define CHAN_MAX_OUTPUT_INT 0x8 +#define CHAN_MAX_OUTPUT_INT 0xF struct irqsteer_data { void __iomem *regs; @@ -228,10 +228,8 @@ static int imx_irqsteer_probe(struct platform_device *pdev) for (i = 0; i < data->irq_count; i++) { data->irq[i] = irq_of_parse_and_map(np, i); - if (!data->irq[i]) { - ret = -EINVAL; - goto out; - } + if (!data->irq[i]) + break; irq_set_chained_handler_and_data(data->irq[i], imx_irqsteer_irq_handler, @@ -254,9 +252,13 @@ static void imx_irqsteer_remove(struct platform_device *pdev) struct irqsteer_data *irqsteer_data = platform_get_drvdata(pdev); int i; - for (i = 0; i < irqsteer_data->irq_count; i++) + for (i = 0; i < irqsteer_data->irq_count; i++) { + if (!irqsteer_data->irq[i]) + break; + irq_set_chained_handler_and_data(irqsteer_data->irq[i], NULL, NULL); + } irq_domain_remove(irqsteer_data->domain); diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c index 4342a21de1eb0..69aacdfc8bef0 100644 --- a/drivers/irqchip/irq-imx-mu-msi.c +++ b/drivers/irqchip/irq-imx-mu-msi.c @@ -214,6 +214,7 @@ static void imx_mu_msi_irq_handler(struct irq_desc *desc) static const struct msi_parent_ops imx_mu_msi_parent_ops = { .supported_flags = IMX_MU_MSI_FLAGS_SUPPORTED, .required_flags = IMX_MU_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "MU-MSI-", diff --git a/drivers/irqchip/irq-jcore-aic.c b/drivers/irqchip/irq-jcore-aic.c index b9dcc8e78c750..1f613eb7b7f03 100644 --- a/drivers/irqchip/irq-jcore-aic.c +++ b/drivers/irqchip/irq-jcore-aic.c @@ -38,7 +38,7 @@ static struct irq_chip jcore_aic; static void handle_jcore_irq(struct irq_desc *desc) { if (irqd_is_per_cpu(irq_desc_get_irq_data(desc))) - handle_percpu_irq(desc); + handle_percpu_devid_irq(desc); else handle_simple_irq(desc); } diff --git a/drivers/irqchip/irq-loongson-pch-msi.c b/drivers/irqchip/irq-loongson-pch-msi.c index bd337ecddb409..9c62108b3ad58 100644 --- a/drivers/irqchip/irq-loongson-pch-msi.c +++ b/drivers/irqchip/irq-loongson-pch-msi.c @@ -146,6 +146,7 @@ static const struct irq_domain_ops pch_msi_middle_domain_ops = { static struct msi_parent_ops pch_msi_parent_ops = { .required_flags = PCH_MSI_FLAGS_REQUIRED, .supported_flags = PCH_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_mask = MATCH_PCI_MSI, .bus_select_token = DOMAIN_BUS_NEXUS, .prefix = "PCH-", diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c index d8e29fc0d4068..51464c6257f37 100644 --- a/drivers/irqchip/irq-msi-lib.c +++ b/drivers/irqchip/irq-msi-lib.c @@ -28,6 +28,7 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, struct msi_domain_info *info) { const struct msi_parent_ops *pops = real_parent->msi_parent_ops; + struct irq_chip *chip = info->chip; u32 required_flags; /* Parent ops available? */ @@ -92,10 +93,10 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, info->flags |= required_flags; /* Chip updates for all child bus types */ - if (!info->chip->irq_eoi) - info->chip->irq_eoi = irq_chip_eoi_parent; - if (!info->chip->irq_ack) - info->chip->irq_ack = irq_chip_ack_parent; + if (!chip->irq_eoi && (pops->chip_flags & MSI_CHIP_FLAG_SET_EOI)) + chip->irq_eoi = irq_chip_eoi_parent; + if (!chip->irq_ack && (pops->chip_flags & MSI_CHIP_FLAG_SET_ACK)) + chip->irq_ack = irq_chip_ack_parent; /* * The device MSI domain can never have a set affinity callback. It @@ -105,7 +106,7 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, * device MSI domain aside of mask/unmask which is provided e.g. by * PCI/MSI device domains. */ - info->chip->irq_set_affinity = msi_domain_set_affinity; + chip->irq_set_affinity = msi_domain_set_affinity; return true; } EXPORT_SYMBOL_GPL(msi_lib_init_dev_msi_info); diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c index 2b6183919ea48..d67f93f6d7505 100644 --- a/drivers/irqchip/irq-mvebu-gicp.c +++ b/drivers/irqchip/irq-mvebu-gicp.c @@ -161,6 +161,7 @@ static const struct irq_domain_ops gicp_domain_ops = { static const struct msi_parent_ops gicp_msi_parent_ops = { .supported_flags = GICP_MSI_FLAGS_SUPPORTED, .required_flags = GICP_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "GICP-", diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c index ff19bfd258dce..28f7e81df94f0 100644 --- a/drivers/irqchip/irq-mvebu-odmi.c +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -157,6 +157,7 @@ static const struct irq_domain_ops odmi_domain_ops = { static const struct msi_parent_ops odmi_msi_parent_ops = { .supported_flags = ODMI_MSI_FLAGS_SUPPORTED, .required_flags = ODMI_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "ODMI-", diff --git a/drivers/irqchip/irq-mvebu-sei.c b/drivers/irqchip/irq-mvebu-sei.c index 065166ab5dbc0..ebd4a9014e8da 100644 --- a/drivers/irqchip/irq-mvebu-sei.c +++ b/drivers/irqchip/irq-mvebu-sei.c @@ -356,6 +356,7 @@ static void mvebu_sei_reset(struct mvebu_sei *sei) static const struct msi_parent_ops sei_msi_parent_ops = { .supported_flags = SEI_MSI_FLAGS_SUPPORTED, .required_flags = SEI_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_mask = MATCH_PLATFORM_MSI, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .prefix = "SEI-", diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 99e27e01b0b19..6a2e41f024464 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -541,43 +541,36 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * return -ENODEV; parent_domain = irq_find_host(parent); - if (!parent_domain) { - dev_err(&pdev->dev, "cannot find parent domain\n"); - return -ENODEV; - } + if (!parent_domain) + return dev_err_probe(dev, -ENODEV, "cannot find parent domain\n"); - rzg2l_irqc_data = devm_kzalloc(&pdev->dev, sizeof(*rzg2l_irqc_data), GFP_KERNEL); + rzg2l_irqc_data = devm_kzalloc(dev, sizeof(*rzg2l_irqc_data), GFP_KERNEL); if (!rzg2l_irqc_data) return -ENOMEM; rzg2l_irqc_data->irqchip = irq_chip; - rzg2l_irqc_data->base = devm_of_iomap(&pdev->dev, pdev->dev.of_node, 0, NULL); + rzg2l_irqc_data->base = devm_of_iomap(dev, dev->of_node, 0, NULL); if (IS_ERR(rzg2l_irqc_data->base)) return PTR_ERR(rzg2l_irqc_data->base); ret = rzg2l_irqc_parse_interrupts(rzg2l_irqc_data, node); - if (ret) { - dev_err(&pdev->dev, "cannot parse interrupts: %d\n", ret); - return ret; - } - - resetn = devm_reset_control_get_exclusive(&pdev->dev, NULL); - if (IS_ERR(resetn)) - return PTR_ERR(resetn); + if (ret) + return dev_err_probe(dev, ret, "cannot parse interrupts: %d\n", ret); - ret = reset_control_deassert(resetn); - if (ret) { - dev_err(&pdev->dev, "failed to deassert resetn pin, %d\n", ret); - return ret; + resetn = devm_reset_control_get_exclusive_deasserted(dev, NULL); + if (IS_ERR(resetn)) { + return dev_err_probe(dev, PTR_ERR(resetn), + "failed to acquire deasserted reset: %d\n", ret); } - pm_runtime_enable(&pdev->dev); - ret = pm_runtime_resume_and_get(&pdev->dev); - if (ret < 0) { - dev_err(&pdev->dev, "pm_runtime_resume_and_get failed: %d\n", ret); - goto pm_disable; - } + ret = devm_pm_runtime_enable(dev); + if (ret) + return dev_err_probe(dev, ret, "devm_pm_runtime_enable failed: %d\n", ret); + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return dev_err_probe(dev, ret, "pm_runtime_resume_and_get failed: %d\n", ret); raw_spin_lock_init(&rzg2l_irqc_data->lock); @@ -585,9 +578,8 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * node, &rzg2l_irqc_domain_ops, rzg2l_irqc_data); if (!irq_domain) { - dev_err(&pdev->dev, "failed to add irq domain\n"); - ret = -ENOMEM; - goto pm_put; + pm_runtime_put(dev); + return dev_err_probe(dev, -ENOMEM, "failed to add irq domain\n"); } register_syscore_ops(&rzg2l_irqc_syscore_ops); @@ -604,13 +596,6 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * dev = NULL; return 0; - -pm_put: - pm_runtime_put(&pdev->dev); -pm_disable: - pm_runtime_disable(&pdev->dev); - reset_control_assert(resetn); - return ret; } static int __init rzg2l_irqc_init(struct device_node *node, diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c index fe2d29e910261..3d5b5fdf9bde8 100644 --- a/drivers/irqchip/irq-renesas-rzv2h.c +++ b/drivers/irqchip/irq-renesas-rzv2h.c @@ -64,11 +64,18 @@ #define ICU_TINT_LEVEL_HIGH 2 #define ICU_TINT_LEVEL_LOW 3 -#define ICU_TSSR_K(tint_nr) ((tint_nr) / 4) -#define ICU_TSSR_TSSEL_N(tint_nr) ((tint_nr) % 4) -#define ICU_TSSR_TSSEL_PREP(tssel, n) ((tssel) << ((n) * 8)) -#define ICU_TSSR_TSSEL_MASK(n) ICU_TSSR_TSSEL_PREP(0x7F, n) -#define ICU_TSSR_TIEN(n) (BIT(7) << ((n) * 8)) +#define ICU_TSSR_TSSEL_PREP(tssel, n, field_width) ((tssel) << ((n) * (field_width))) +#define ICU_TSSR_TSSEL_MASK(n, field_width) \ +({\ + typeof(field_width) (_field_width) = (field_width); \ + ICU_TSSR_TSSEL_PREP((GENMASK(((_field_width) - 2), 0)), (n), _field_width); \ +}) + +#define ICU_TSSR_TIEN(n, field_width) \ +({\ + typeof(field_width) (_field_width) = (field_width); \ + BIT((_field_width) - 1) << ((n) * (_field_width)); \ +}) #define ICU_TITSR_K(tint_nr) ((tint_nr) / 16) #define ICU_TITSR_TITSEL_N(tint_nr) ((tint_nr) % 16) @@ -78,20 +85,36 @@ #define ICU_TINT_EXTRACT_HWIRQ(x) FIELD_GET(GENMASK(15, 0), (x)) #define ICU_TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) -#define ICU_PB5_TINT 0x55 +#define ICU_RZG3E_TINT_OFFSET 0x800 +#define ICU_RZG3E_TSSEL_MAX_VAL 0x8c +#define ICU_RZV2H_TSSEL_MAX_VAL 0x55 + +/** + * struct rzv2h_hw_info - Interrupt Control Unit controller hardware info structure. + * @tssel_lut: TINT lookup table + * @t_offs: TINT offset + * @max_tssel: TSSEL max value + * @field_width: TSSR field width + */ +struct rzv2h_hw_info { + const u8 *tssel_lut; + u16 t_offs; + u8 max_tssel; + u8 field_width; +}; /** * struct rzv2h_icu_priv - Interrupt Control Unit controller private data structure. * @base: Controller's base address - * @irqchip: Pointer to struct irq_chip * @fwspec: IRQ firmware specific data * @lock: Lock to serialize access to hardware registers + * @info: Pointer to struct rzv2h_hw_info */ struct rzv2h_icu_priv { void __iomem *base; - const struct irq_chip *irqchip; struct irq_fwspec fwspec[ICU_NUM_IRQ]; raw_spinlock_t lock; + const struct rzv2h_hw_info *info; }; static inline struct rzv2h_icu_priv *irq_data_to_priv(struct irq_data *data) @@ -111,7 +134,7 @@ static void rzv2h_icu_eoi(struct irq_data *d) tintirq_nr = hw_irq - ICU_TINT_START; bit = BIT(tintirq_nr); if (!irqd_is_level_type(d)) - writel_relaxed(bit, priv->base + ICU_TSCLR); + writel_relaxed(bit, priv->base + priv->info->t_offs + ICU_TSCLR); } else if (hw_irq >= ICU_IRQ_START) { tintirq_nr = hw_irq - ICU_IRQ_START; bit = BIT(tintirq_nr); @@ -130,21 +153,23 @@ static void rzv2h_tint_irq_endisable(struct irq_data *d, bool enable) struct rzv2h_icu_priv *priv = irq_data_to_priv(d); unsigned int hw_irq = irqd_to_hwirq(d); u32 tint_nr, tssel_n, k, tssr; + u8 nr_tint; if (hw_irq < ICU_TINT_START) return; tint_nr = hw_irq - ICU_TINT_START; - k = ICU_TSSR_K(tint_nr); - tssel_n = ICU_TSSR_TSSEL_N(tint_nr); + nr_tint = 32 / priv->info->field_width; + k = tint_nr / nr_tint; + tssel_n = tint_nr % nr_tint; guard(raw_spinlock)(&priv->lock); - tssr = readl_relaxed(priv->base + ICU_TSSR(k)); + tssr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TSSR(k)); if (enable) - tssr |= ICU_TSSR_TIEN(tssel_n); + tssr |= ICU_TSSR_TIEN(tssel_n, priv->info->field_width); else - tssr &= ~ICU_TSSR_TIEN(tssel_n); - writel_relaxed(tssr, priv->base + ICU_TSSR(k)); + tssr &= ~ICU_TSSR_TIEN(tssel_n, priv->info->field_width); + writel_relaxed(tssr, priv->base + priv->info->t_offs + ICU_TSSR(k)); } static void rzv2h_icu_irq_disable(struct irq_data *d) @@ -247,8 +272,8 @@ static void rzv2h_clear_tint_int(struct rzv2h_icu_priv *priv, unsigned int hwirq u32 bit = BIT(tint_nr); int k = tint_nr / 16; - tsctr = readl_relaxed(priv->base + ICU_TSCTR); - titsr = readl_relaxed(priv->base + ICU_TITSR(k)); + tsctr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TSCTR); + titsr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TITSR(k)); titsel = ICU_TITSR_TITSEL_GET(titsr, titsel_n); /* @@ -257,7 +282,7 @@ static void rzv2h_clear_tint_int(struct rzv2h_icu_priv *priv, unsigned int hwirq */ if ((tsctr & bit) && ((titsel == ICU_TINT_EDGE_RISING) || (titsel == ICU_TINT_EDGE_FALLING))) - writel_relaxed(bit, priv->base + ICU_TSCLR); + writel_relaxed(bit, priv->base + priv->info->t_offs + ICU_TSCLR); } static int rzv2h_tint_set_type(struct irq_data *d, unsigned int type) @@ -268,6 +293,7 @@ static int rzv2h_tint_set_type(struct irq_data *d, unsigned int type) unsigned int hwirq; u32 tint, sense; int tint_nr; + u8 nr_tint; switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_LEVEL_LOW: @@ -290,39 +316,42 @@ static int rzv2h_tint_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } + priv = irq_data_to_priv(d); tint = (u32)(uintptr_t)irq_data_get_irq_chip_data(d); - if (tint > ICU_PB5_TINT) + if (tint > priv->info->max_tssel) return -EINVAL; - priv = irq_data_to_priv(d); - hwirq = irqd_to_hwirq(d); + if (priv->info->tssel_lut) + tint = priv->info->tssel_lut[tint]; + hwirq = irqd_to_hwirq(d); tint_nr = hwirq - ICU_TINT_START; - tssr_k = ICU_TSSR_K(tint_nr); - tssel_n = ICU_TSSR_TSSEL_N(tint_nr); + nr_tint = 32 / priv->info->field_width; + tssr_k = tint_nr / nr_tint; + tssel_n = tint_nr % nr_tint; + tien = ICU_TSSR_TIEN(tssel_n, priv->info->field_width); titsr_k = ICU_TITSR_K(tint_nr); titsel_n = ICU_TITSR_TITSEL_N(tint_nr); - tien = ICU_TSSR_TIEN(titsel_n); guard(raw_spinlock)(&priv->lock); - tssr = readl_relaxed(priv->base + ICU_TSSR(tssr_k)); - tssr &= ~(ICU_TSSR_TSSEL_MASK(tssel_n) | tien); - tssr |= ICU_TSSR_TSSEL_PREP(tint, tssel_n); + tssr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TSSR(tssr_k)); + tssr &= ~(ICU_TSSR_TSSEL_MASK(tssel_n, priv->info->field_width) | tien); + tssr |= ICU_TSSR_TSSEL_PREP(tint, tssel_n, priv->info->field_width); - writel_relaxed(tssr, priv->base + ICU_TSSR(tssr_k)); + writel_relaxed(tssr, priv->base + priv->info->t_offs + ICU_TSSR(tssr_k)); - titsr = readl_relaxed(priv->base + ICU_TITSR(titsr_k)); + titsr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TITSR(titsr_k)); titsr &= ~ICU_TITSR_TITSEL_MASK(titsel_n); titsr |= ICU_TITSR_TITSEL_PREP(sense, titsel_n); - writel_relaxed(titsr, priv->base + ICU_TITSR(titsr_k)); + writel_relaxed(titsr, priv->base + priv->info->t_offs + ICU_TITSR(titsr_k)); rzv2h_clear_tint_int(priv, hwirq); - writel_relaxed(tssr | tien, priv->base + ICU_TSSR(tssr_k)); + writel_relaxed(tssr | tien, priv->base + priv->info->t_offs + ICU_TSSR(tssr_k)); return 0; } @@ -390,7 +419,7 @@ static int rzv2h_icu_alloc(struct irq_domain *domain, unsigned int virq, unsigne if (hwirq > (ICU_NUM_IRQ - 1)) return -EINVAL; - ret = irq_domain_set_hwirq_and_chip(domain, virq, hwirq, priv->irqchip, + ret = irq_domain_set_hwirq_and_chip(domain, virq, hwirq, &rzv2h_icu_chip, (void *)(uintptr_t)tint); if (ret) return ret; @@ -421,7 +450,13 @@ static int rzv2h_icu_parse_interrupts(struct rzv2h_icu_priv *priv, struct device return 0; } -static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) +static void rzv2h_icu_put_device(void *data) +{ + put_device(data); +} + +static int rzv2h_icu_init_common(struct device_node *node, struct device_node *parent, + const struct rzv2h_hw_info *hw_info) { struct irq_domain *irq_domain, *parent_domain; struct rzv2h_icu_priv *rzv2h_icu_data; @@ -433,50 +468,48 @@ static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) if (!pdev) return -ENODEV; + ret = devm_add_action_or_reset(&pdev->dev, rzv2h_icu_put_device, + &pdev->dev); + if (ret < 0) + return ret; + parent_domain = irq_find_host(parent); if (!parent_domain) { dev_err(&pdev->dev, "cannot find parent domain\n"); - ret = -ENODEV; - goto put_dev; + return -ENODEV; } rzv2h_icu_data = devm_kzalloc(&pdev->dev, sizeof(*rzv2h_icu_data), GFP_KERNEL); - if (!rzv2h_icu_data) { - ret = -ENOMEM; - goto put_dev; - } - - rzv2h_icu_data->irqchip = &rzv2h_icu_chip; + if (!rzv2h_icu_data) + return -ENOMEM; rzv2h_icu_data->base = devm_of_iomap(&pdev->dev, pdev->dev.of_node, 0, NULL); - if (IS_ERR(rzv2h_icu_data->base)) { - ret = PTR_ERR(rzv2h_icu_data->base); - goto put_dev; - } + if (IS_ERR(rzv2h_icu_data->base)) + return PTR_ERR(rzv2h_icu_data->base); ret = rzv2h_icu_parse_interrupts(rzv2h_icu_data, node); if (ret) { dev_err(&pdev->dev, "cannot parse interrupts: %d\n", ret); - goto put_dev; + return ret; } - resetn = devm_reset_control_get_exclusive(&pdev->dev, NULL); + resetn = devm_reset_control_get_exclusive_deasserted(&pdev->dev, NULL); if (IS_ERR(resetn)) { ret = PTR_ERR(resetn); - goto put_dev; + dev_err(&pdev->dev, "failed to acquire deasserted reset: %d\n", ret); + return ret; } - ret = reset_control_deassert(resetn); - if (ret) { - dev_err(&pdev->dev, "failed to deassert resetn pin, %d\n", ret); - goto put_dev; + ret = devm_pm_runtime_enable(&pdev->dev); + if (ret < 0) { + dev_err(&pdev->dev, "devm_pm_runtime_enable failed, %d\n", ret); + return ret; } - pm_runtime_enable(&pdev->dev); ret = pm_runtime_resume_and_get(&pdev->dev); if (ret < 0) { dev_err(&pdev->dev, "pm_runtime_resume_and_get failed: %d\n", ret); - goto pm_disable; + return ret; } raw_spin_lock_init(&rzv2h_icu_data->lock); @@ -489,6 +522,8 @@ static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) goto pm_put; } + rzv2h_icu_data->info = hw_info; + /* * coccicheck complains about a missing put_device call before returning, but it's a false * positive. We still need &pdev->dev after successfully returning from this function. @@ -497,16 +532,61 @@ static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) pm_put: pm_runtime_put(&pdev->dev); -pm_disable: - pm_runtime_disable(&pdev->dev); - reset_control_assert(resetn); -put_dev: - put_device(&pdev->dev); return ret; } +/* Mapping based on port index on Table 4.2-6 and TSSEL bits on Table 4.6-4 */ +static const u8 rzg3e_tssel_lut[] = { + 81, 82, 83, 84, 85, 86, 87, 88, /* P00-P07 */ + 89, 90, 91, 92, 93, 94, 95, 96, /* P10-P17 */ + 111, 112, /* P20-P21 */ + 97, 98, 99, 100, 101, 102, 103, 104, /* P30-P37 */ + 105, 106, 107, 108, 109, 110, /* P40-P45 */ + 113, 114, 115, 116, 117, 118, 119, /* P50-P56 */ + 120, 121, 122, 123, 124, 125, 126, /* P60-P66 */ + 127, 128, 129, 130, 131, 132, 133, 134, /* P70-P77 */ + 135, 136, 137, 138, 139, 140, /* P80-P85 */ + 43, 44, 45, 46, 47, 48, 49, 50, /* PA0-PA7 */ + 51, 52, 53, 54, 55, 56, 57, 58, /* PB0-PB7 */ + 59, 60, 61, /* PC0-PC2 */ + 62, 63, 64, 65, 66, 67, 68, 69, /* PD0-PD7 */ + 70, 71, 72, 73, 74, 75, 76, 77, /* PE0-PE7 */ + 78, 79, 80, /* PF0-PF2 */ + 25, 26, 27, 28, 29, 30, 31, 32, /* PG0-PG7 */ + 33, 34, 35, 36, 37, 38, /* PH0-PH5 */ + 4, 5, 6, 7, 8, /* PJ0-PJ4 */ + 39, 40, 41, 42, /* PK0-PK3 */ + 9, 10, 11, 12, 21, 22, 23, 24, /* PL0-PL7 */ + 13, 14, 15, 16, 17, 18, 19, 20, /* PM0-PM7 */ + 0, 1, 2, 3 /* PS0-PS3 */ +}; + +static const struct rzv2h_hw_info rzg3e_hw_params = { + .tssel_lut = rzg3e_tssel_lut, + .t_offs = ICU_RZG3E_TINT_OFFSET, + .max_tssel = ICU_RZG3E_TSSEL_MAX_VAL, + .field_width = 16, +}; + +static const struct rzv2h_hw_info rzv2h_hw_params = { + .t_offs = 0, + .max_tssel = ICU_RZV2H_TSSEL_MAX_VAL, + .field_width = 8, +}; + +static int rzg3e_icu_init(struct device_node *node, struct device_node *parent) +{ + return rzv2h_icu_init_common(node, parent, &rzg3e_hw_params); +} + +static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) +{ + return rzv2h_icu_init_common(node, parent, &rzv2h_hw_params); +} + IRQCHIP_PLATFORM_DRIVER_BEGIN(rzv2h_icu) +IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_init) IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_init) IRQCHIP_PLATFORM_DRIVER_END(rzv2h_icu) MODULE_AUTHOR("Fabrizio Castro "); diff --git a/drivers/irqchip/irq-riscv-aplic-direct.c b/drivers/irqchip/irq-riscv-aplic-direct.c index 7cd6b646774b9..205ad61d15e49 100644 --- a/drivers/irqchip/irq-riscv-aplic-direct.c +++ b/drivers/irqchip/irq-riscv-aplic-direct.c @@ -31,7 +31,7 @@ struct aplic_direct { }; struct aplic_idc { - unsigned int hart_index; + u32 hart_index; void __iomem *regs; struct aplic_direct *direct; }; @@ -219,6 +219,20 @@ static int aplic_direct_parse_parent_hwirq(struct device *dev, u32 index, return 0; } +static int aplic_direct_get_hart_index(struct device *dev, u32 logical_index, + u32 *hart_index) +{ + const char *prop_hart_index = "riscv,hart-indexes"; + struct device_node *np = to_of_node(dev->fwnode); + + if (!np || !of_property_present(np, prop_hart_index)) { + *hart_index = logical_index; + return 0; + } + + return of_property_read_u32_index(np, prop_hart_index, logical_index, hart_index); +} + int aplic_direct_setup(struct device *dev, void __iomem *regs) { int i, j, rc, cpu, current_cpu, setup_count = 0; @@ -265,8 +279,12 @@ int aplic_direct_setup(struct device *dev, void __iomem *regs) cpumask_set_cpu(cpu, &direct->lmask); idc = per_cpu_ptr(&aplic_idcs, cpu); - idc->hart_index = i; - idc->regs = priv->regs + APLIC_IDC_BASE + i * APLIC_IDC_SIZE; + rc = aplic_direct_get_hart_index(dev, i, &idc->hart_index); + if (rc) { + dev_warn(dev, "hart index not found for IDC%d\n", i); + continue; + } + idc->regs = priv->regs + APLIC_IDC_BASE + idc->hart_index * APLIC_IDC_SIZE; idc->direct = direct; aplic_idc_set_delivery(idc, true); diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index 275df50057057..d9ae87808651a 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -73,10 +73,16 @@ static int __init imsic_ipi_domain_init(void) { return 0; } static void imsic_handle_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - int err, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct imsic_vector *vec; unsigned long local_id; + /* + * Process pending local synchronization instead of waiting + * for per-CPU local timer to expire. + */ + imsic_local_sync_all(false); + chained_irq_enter(chip, desc); while ((local_id = csr_swap(CSR_TOPEI, 0))) { @@ -97,9 +103,7 @@ static void imsic_handle_irq(struct irq_desc *desc) continue; } - err = generic_handle_domain_irq(imsic->base_domain, vec->hwirq); - if (unlikely(err)) - pr_warn_ratelimited("hwirq 0x%x mapping not found\n", vec->hwirq); + generic_handle_irq(vec->irq); } chained_irq_exit(chip, desc); @@ -120,7 +124,7 @@ static int imsic_starting_cpu(unsigned int cpu) * Interrupts identities might have been enabled/disabled while * this CPU was not running so sync-up local enable/disable state. */ - imsic_local_sync_all(); + imsic_local_sync_all(true); /* Enable local interrupt delivery */ imsic_local_delivery(true); diff --git a/drivers/irqchip/irq-riscv-imsic-platform.c b/drivers/irqchip/irq-riscv-imsic-platform.c index c708780e8760f..b8ae67c25b37a 100644 --- a/drivers/irqchip/irq-riscv-imsic-platform.c +++ b/drivers/irqchip/irq-riscv-imsic-platform.c @@ -20,6 +20,7 @@ #include #include +#include "irq-msi-lib.h" #include "irq-riscv-imsic-state.h" static bool imsic_cpu_page_phys(unsigned int cpu, unsigned int guest_index, @@ -63,6 +64,11 @@ static int imsic_irq_retrigger(struct irq_data *d) return 0; } +static void imsic_irq_ack(struct irq_data *d) +{ + irq_move_irq(d); +} + static void imsic_irq_compose_vector_msg(struct imsic_vector *vec, struct msi_msg *msg) { phys_addr_t msi_addr; @@ -96,9 +102,23 @@ static int imsic_irq_set_affinity(struct irq_data *d, const struct cpumask *mask bool force) { struct imsic_vector *old_vec, *new_vec; - struct irq_data *pd = d->parent_data; + struct imsic_vector tmp_vec; + + /* + * Requirements for the downstream irqdomains (or devices): + * + * 1) Downstream irqdomains (or devices) with atomic MSI update can + * happily do imsic_irq_set_affinity() in the process-context on + * any CPU so the irqchip of such irqdomains must not set the + * IRQCHIP_MOVE_DEFERRED flag. + * + * 2) Downstream irqdomains (or devices) with non-atomic MSI update + * must use imsic_irq_set_affinity() in nterrupt-context upon + * the next device interrupt so the irqchip of such irqdomains + * must set the IRQCHIP_MOVE_DEFERRED flag. + */ - old_vec = irq_data_get_irq_chip_data(pd); + old_vec = irq_data_get_irq_chip_data(d); if (WARN_ON(!old_vec)) return -ENOENT; @@ -111,34 +131,95 @@ static int imsic_irq_set_affinity(struct irq_data *d, const struct cpumask *mask return -EBUSY; /* Get a new vector on the desired set of CPUs */ - new_vec = imsic_vector_alloc(old_vec->hwirq, mask_val); + new_vec = imsic_vector_alloc(old_vec->irq, mask_val); if (!new_vec) return -ENOSPC; + /* + * Device having non-atomic MSI update might see an intermediate + * state when changing target IMSIC vector from one CPU to another. + * + * To avoid losing interrupt to such intermediate state, do the + * following (just like x86 APIC): + * + * 1) First write a temporary IMSIC vector to the device which + * has MSI address same as the old IMSIC vector but MSI data + * matches the new IMSIC vector. + * + * 2) Next write the new IMSIC vector to the device. + * + * Based on the above, __imsic_local_sync() must check pending + * status of both old MSI data and new MSI data on the old CPU. + */ + if (!irq_can_move_in_process_context(d) && + new_vec->local_id != old_vec->local_id) { + /* Setup temporary vector */ + tmp_vec.cpu = old_vec->cpu; + tmp_vec.local_id = new_vec->local_id; + + /* Point device to the temporary vector */ + imsic_msi_update_msg(irq_get_irq_data(d->irq), &tmp_vec); + } + /* Point device to the new vector */ - imsic_msi_update_msg(d, new_vec); + imsic_msi_update_msg(irq_get_irq_data(d->irq), new_vec); /* Update irq descriptors with the new vector */ - pd->chip_data = new_vec; + d->chip_data = new_vec; - /* Update effective affinity of parent irq data */ - irq_data_update_effective_affinity(pd, cpumask_of(new_vec->cpu)); + /* Update effective affinity */ + irq_data_update_effective_affinity(d, cpumask_of(new_vec->cpu)); /* Move state of the old vector to the new vector */ imsic_vector_move(old_vec, new_vec); return IRQ_SET_MASK_OK_DONE; } + +static void imsic_irq_force_complete_move(struct irq_data *d) +{ + struct imsic_vector *mvec, *vec = irq_data_get_irq_chip_data(d); + unsigned int cpu = smp_processor_id(); + + if (WARN_ON(!vec)) + return; + + /* Do nothing if there is no in-flight move */ + mvec = imsic_vector_get_move(vec); + if (!mvec) + return; + + /* Do nothing if the old IMSIC vector does not belong to current CPU */ + if (mvec->cpu != cpu) + return; + + /* + * The best we can do is force cleanup the old IMSIC vector. + * + * The challenges over here are same as x86 vector domain so + * refer to the comments in irq_force_complete_move() function + * implemented at arch/x86/kernel/apic/vector.c. + */ + + /* Force cleanup in-flight move */ + pr_info("IRQ fixup: irq %d move in progress, old vector cpu %d local_id %d\n", + d->irq, mvec->cpu, mvec->local_id); + imsic_vector_force_move_cleanup(vec); +} #endif static struct irq_chip imsic_irq_base_chip = { - .name = "IMSIC", - .irq_mask = imsic_irq_mask, - .irq_unmask = imsic_irq_unmask, - .irq_retrigger = imsic_irq_retrigger, - .irq_compose_msi_msg = imsic_irq_compose_msg, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_MASK_ON_SUSPEND, + .name = "IMSIC", + .irq_mask = imsic_irq_mask, + .irq_unmask = imsic_irq_unmask, +#ifdef CONFIG_SMP + .irq_set_affinity = imsic_irq_set_affinity, + .irq_force_complete_move = imsic_irq_force_complete_move, +#endif + .irq_retrigger = imsic_irq_retrigger, + .irq_ack = imsic_irq_ack, + .irq_compose_msi_msg = imsic_irq_compose_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, }; static int imsic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, @@ -155,7 +236,7 @@ static int imsic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, return -ENOSPC; irq_domain_set_info(domain, virq, virq, &imsic_irq_base_chip, vec, - handle_simple_irq, NULL, NULL); + handle_edge_irq, NULL, NULL); irq_set_noprobe(virq); irq_set_affinity(virq, cpu_online_mask); irq_data_update_effective_affinity(irq_get_irq_data(virq), cpumask_of(vec->cpu)); @@ -172,22 +253,6 @@ static void imsic_irq_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_parent(domain, virq, nr_irqs); } -static int imsic_irq_domain_select(struct irq_domain *domain, struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token) -{ - const struct msi_parent_ops *ops = domain->msi_parent_ops; - u32 busmask = BIT(bus_token); - - if (fwspec->fwnode != domain->fwnode || fwspec->param_count != 0) - return 0; - - /* Handle pure domain searches */ - if (bus_token == ops->bus_select_token) - return 1; - - return !!(ops->bus_select_mask & busmask); -} - #ifdef CONFIG_GENERIC_IRQ_DEBUGFS static void imsic_irq_debug_show(struct seq_file *m, struct irq_domain *d, struct irq_data *irqd, int ind) @@ -204,107 +269,37 @@ static void imsic_irq_debug_show(struct seq_file *m, struct irq_domain *d, static const struct irq_domain_ops imsic_base_domain_ops = { .alloc = imsic_irq_domain_alloc, .free = imsic_irq_domain_free, - .select = imsic_irq_domain_select, + .select = msi_lib_irq_domain_select, #ifdef CONFIG_GENERIC_IRQ_DEBUGFS .debug_show = imsic_irq_debug_show, #endif }; -#ifdef CONFIG_RISCV_IMSIC_PCI - -static void imsic_pci_mask_irq(struct irq_data *d) -{ - pci_msi_mask_irq(d); - irq_chip_mask_parent(d); -} - -static void imsic_pci_unmask_irq(struct irq_data *d) +static bool imsic_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) { - irq_chip_unmask_parent(d); - pci_msi_unmask_irq(d); -} - -#define MATCH_PCI_MSI BIT(DOMAIN_BUS_PCI_MSI) - -#else - -#define MATCH_PCI_MSI 0 - -#endif - -static bool imsic_init_dev_msi_info(struct device *dev, - struct irq_domain *domain, - struct irq_domain *real_parent, - struct msi_domain_info *info) -{ - const struct msi_parent_ops *pops = real_parent->msi_parent_ops; - - /* MSI parent domain specific settings */ - switch (real_parent->bus_token) { - case DOMAIN_BUS_NEXUS: - if (WARN_ON_ONCE(domain != real_parent)) - return false; -#ifdef CONFIG_SMP - info->chip->irq_set_affinity = imsic_irq_set_affinity; -#endif - break; - default: - WARN_ON_ONCE(1); + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) return false; - } - /* Is the target supported? */ switch (info->bus_token) { -#ifdef CONFIG_RISCV_IMSIC_PCI case DOMAIN_BUS_PCI_DEVICE_MSI: case DOMAIN_BUS_PCI_DEVICE_MSIX: - info->chip->irq_mask = imsic_pci_mask_irq; - info->chip->irq_unmask = imsic_pci_unmask_irq; - break; -#endif - case DOMAIN_BUS_DEVICE_MSI: - /* - * Per-device MSI should never have any MSI feature bits - * set. It's sole purpose is to create a dumb interrupt - * chip which has a device specific irq_write_msi_msg() - * callback. - */ - if (WARN_ON_ONCE(info->flags)) - return false; - - /* Core managed MSI descriptors */ - info->flags |= MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS | - MSI_FLAG_FREE_MSI_DESCS; - break; - case DOMAIN_BUS_WIRED_TO_MSI: + info->chip->flags |= IRQCHIP_MOVE_DEFERRED; break; default: - WARN_ON_ONCE(1); - return false; + break; } - /* Use hierarchial chip operations re-trigger */ - info->chip->irq_retrigger = irq_chip_retrigger_hierarchy; - - /* - * Mask out the domain specific MSI feature flags which are not - * supported by the real parent. - */ - info->flags &= pops->supported_flags; - - /* Enforce the required flags */ - info->flags |= pops->required_flags; - return true; } -#define MATCH_PLATFORM_MSI BIT(DOMAIN_BUS_PLATFORM_MSI) - static const struct msi_parent_ops imsic_msi_parent_ops = { .supported_flags = MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX, .required_flags = MSI_FLAG_USE_DEF_DOM_OPS | - MSI_FLAG_USE_DEF_CHIP_OPS, + MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSI_MASK_PARENT, + .chip_flags = MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .init_dev_msi_info = imsic_init_dev_msi_info, diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index b97e6cd89ed74..bdf5cd2037f28 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -124,10 +124,11 @@ void __imsic_eix_update(unsigned long base_id, unsigned long num_id, bool pend, } } -static void __imsic_local_sync(struct imsic_local_priv *lpriv) +static bool __imsic_local_sync(struct imsic_local_priv *lpriv) { - struct imsic_local_config *mlocal; - struct imsic_vector *vec, *mvec; + struct imsic_local_config *tlocal, *mlocal; + struct imsic_vector *vec, *tvec, *mvec; + bool ret = true; int i; lockdep_assert_held(&lpriv->lock); @@ -143,35 +144,97 @@ static void __imsic_local_sync(struct imsic_local_priv *lpriv) __imsic_id_clear_enable(i); /* - * If the ID was being moved to a new ID on some other CPU - * then we can get a MSI during the movement so check the - * ID pending bit and re-trigger the new ID on other CPU - * using MMIO write. + * Clear the previous vector pointer of the new vector only + * after the movement is complete on the old CPU. */ - mvec = READ_ONCE(vec->move); - WRITE_ONCE(vec->move, NULL); - if (mvec && mvec != vec) { - if (__imsic_id_read_clear_pending(i)) { + mvec = READ_ONCE(vec->move_prev); + if (mvec) { + /* + * If the old vector has not been updated then + * try again in the next sync-up call. + */ + if (READ_ONCE(mvec->move_next)) { + ret = false; + continue; + } + + WRITE_ONCE(vec->move_prev, NULL); + } + + /* + * If a vector was being moved to a new vector on some other + * CPU then we can get a MSI during the movement so check the + * ID pending bit and re-trigger the new ID on other CPU using + * MMIO write. + */ + mvec = READ_ONCE(vec->move_next); + if (mvec) { + /* + * Devices having non-atomic MSI update might see + * an intermediate state so check both old ID and + * new ID for pending interrupts. + * + * For details, see imsic_irq_set_affinity(). + */ + tvec = vec->local_id == mvec->local_id ? + NULL : &lpriv->vectors[mvec->local_id]; + + if (tvec && !irq_can_move_in_process_context(irq_get_irq_data(vec->irq)) && + __imsic_id_read_clear_pending(tvec->local_id)) { + /* Retrigger temporary vector if it was already in-use */ + if (READ_ONCE(tvec->enable)) { + tlocal = per_cpu_ptr(imsic->global.local, tvec->cpu); + writel_relaxed(tvec->local_id, tlocal->msi_va); + } + + mlocal = per_cpu_ptr(imsic->global.local, mvec->cpu); + writel_relaxed(mvec->local_id, mlocal->msi_va); + } + + if (__imsic_id_read_clear_pending(vec->local_id)) { mlocal = per_cpu_ptr(imsic->global.local, mvec->cpu); writel_relaxed(mvec->local_id, mlocal->msi_va); } - imsic_vector_free(&lpriv->vectors[i]); + WRITE_ONCE(vec->move_next, NULL); + imsic_vector_free(vec); } skip: bitmap_clear(lpriv->dirty_bitmap, i, 1); } + + return ret; } -void imsic_local_sync_all(void) +#ifdef CONFIG_SMP +static void __imsic_local_timer_start(struct imsic_local_priv *lpriv) +{ + lockdep_assert_held(&lpriv->lock); + + if (!timer_pending(&lpriv->timer)) { + lpriv->timer.expires = jiffies + 1; + add_timer_on(&lpriv->timer, smp_processor_id()); + } +} +#else +static inline void __imsic_local_timer_start(struct imsic_local_priv *lpriv) +{ +} +#endif + +void imsic_local_sync_all(bool force_all) { struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv); unsigned long flags; raw_spin_lock_irqsave(&lpriv->lock, flags); - bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1); - __imsic_local_sync(lpriv); + + if (force_all) + bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1); + if (!__imsic_local_sync(lpriv)) + __imsic_local_timer_start(lpriv); + raw_spin_unlock_irqrestore(&lpriv->lock, flags); } @@ -190,12 +253,7 @@ void imsic_local_delivery(bool enable) #ifdef CONFIG_SMP static void imsic_local_timer_callback(struct timer_list *timer) { - struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv); - unsigned long flags; - - raw_spin_lock_irqsave(&lpriv->lock, flags); - __imsic_local_sync(lpriv); - raw_spin_unlock_irqrestore(&lpriv->lock, flags); + imsic_local_sync_all(false); } static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu) @@ -216,14 +274,11 @@ static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu */ if (cpu_online(cpu)) { if (cpu == smp_processor_id()) { - __imsic_local_sync(lpriv); - return; + if (__imsic_local_sync(lpriv)) + return; } - if (!timer_pending(&lpriv->timer)) { - lpriv->timer.expires = jiffies + 1; - add_timer_on(&lpriv->timer, cpu); - } + __imsic_local_timer_start(lpriv); } } #else @@ -278,8 +333,26 @@ void imsic_vector_unmask(struct imsic_vector *vec) raw_spin_unlock(&lpriv->lock); } -static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, struct imsic_vector *vec, - bool new_enable, struct imsic_vector *new_move) +void imsic_vector_force_move_cleanup(struct imsic_vector *vec) +{ + struct imsic_local_priv *lpriv; + struct imsic_vector *mvec; + unsigned long flags; + + lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu); + raw_spin_lock_irqsave(&lpriv->lock, flags); + + mvec = READ_ONCE(vec->move_prev); + WRITE_ONCE(vec->move_prev, NULL); + if (mvec) + imsic_vector_free(mvec); + + raw_spin_unlock_irqrestore(&lpriv->lock, flags); +} + +static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, + struct imsic_vector *vec, bool is_old_vec, + bool new_enable, struct imsic_vector *move_vec) { unsigned long flags; bool enabled; @@ -289,7 +362,10 @@ static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, struct imsi /* Update enable and move details */ enabled = READ_ONCE(vec->enable); WRITE_ONCE(vec->enable, new_enable); - WRITE_ONCE(vec->move, new_move); + if (is_old_vec) + WRITE_ONCE(vec->move_next, move_vec); + else + WRITE_ONCE(vec->move_prev, move_vec); /* Mark the vector as dirty and synchronize */ bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1); @@ -322,8 +398,8 @@ void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_ve * interrupt on the old vector while device was being moved * to the new vector. */ - enabled = imsic_vector_move_update(old_lpriv, old_vec, false, new_vec); - imsic_vector_move_update(new_lpriv, new_vec, enabled, new_vec); + enabled = imsic_vector_move_update(old_lpriv, old_vec, true, false, new_vec); + imsic_vector_move_update(new_lpriv, new_vec, false, enabled, old_vec); } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS @@ -368,7 +444,7 @@ struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int l return &lpriv->vectors[local_id]; } -struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask) +struct imsic_vector *imsic_vector_alloc(unsigned int irq, const struct cpumask *mask) { struct imsic_vector *vec = NULL; struct imsic_local_priv *lpriv; @@ -384,9 +460,10 @@ struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask lpriv = per_cpu_ptr(imsic->lpriv, cpu); vec = &lpriv->vectors[local_id]; - vec->hwirq = hwirq; + vec->irq = irq; vec->enable = false; - vec->move = NULL; + vec->move_next = NULL; + vec->move_prev = NULL; return vec; } @@ -396,7 +473,7 @@ void imsic_vector_free(struct imsic_vector *vec) unsigned long flags; raw_spin_lock_irqsave(&imsic->matrix_lock, flags); - vec->hwirq = UINT_MAX; + vec->irq = 0; irq_matrix_free(imsic->matrix, vec->cpu, vec->local_id, false); raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags); } @@ -455,7 +532,7 @@ static int __init imsic_local_init(void) vec = &lpriv->vectors[i]; vec->cpu = cpu; vec->local_id = i; - vec->hwirq = UINT_MAX; + vec->irq = 0; } } diff --git a/drivers/irqchip/irq-riscv-imsic-state.h b/drivers/irqchip/irq-riscv-imsic-state.h index 391e442808275..3202ffa4e849d 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.h +++ b/drivers/irqchip/irq-riscv-imsic-state.h @@ -20,10 +20,11 @@ struct imsic_vector { unsigned int cpu; unsigned int local_id; /* Details saved by driver in the vector */ - unsigned int hwirq; + unsigned int irq; /* Details accessed using local lock held */ bool enable; - struct imsic_vector *move; + struct imsic_vector *move_next; + struct imsic_vector *move_prev; }; struct imsic_local_priv { @@ -74,7 +75,7 @@ static inline void __imsic_id_clear_enable(unsigned long id) __imsic_eix_update(id, 1, false, false); } -void imsic_local_sync_all(void); +void imsic_local_sync_all(bool force_all); void imsic_local_delivery(bool enable); void imsic_vector_mask(struct imsic_vector *vec); @@ -87,14 +88,15 @@ static inline bool imsic_vector_isenabled(struct imsic_vector *vec) static inline struct imsic_vector *imsic_vector_get_move(struct imsic_vector *vec) { - return READ_ONCE(vec->move); + return READ_ONCE(vec->move_prev); } +void imsic_vector_force_move_cleanup(struct imsic_vector *vec); void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_vec); struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id); -struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask); +struct imsic_vector *imsic_vector_alloc(unsigned int irq, const struct cpumask *mask); void imsic_vector_free(struct imsic_vector *vector); void imsic_vector_debug_show(struct seq_file *m, struct imsic_vector *vec, int ind); diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c new file mode 100644 index 0000000000000..ee682e87eb8be --- /dev/null +++ b/drivers/irqchip/irq-sg2042-msi.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SG2042 MSI Controller + * + * Copyright (C) 2024 Sophgo Technology Inc. + * Copyright (C) 2024 Chen Wang + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "irq-msi-lib.h" + +#define SG2042_MAX_MSI_VECTOR 32 + +struct sg2042_msi_chipdata { + void __iomem *reg_clr; // clear reg, see TRM, 10.1.33, GP_INTR0_CLR + + phys_addr_t doorbell_addr; // see TRM, 10.1.32, GP_INTR0_SET + + u32 irq_first; // The vector number that MSIs starts + u32 num_irqs; // The number of vectors for MSIs + + DECLARE_BITMAP(msi_map, SG2042_MAX_MSI_VECTOR); + struct mutex msi_map_lock; // lock for msi_map +}; + +static int sg2042_msi_allocate_hwirq(struct sg2042_msi_chipdata *data, int num_req) +{ + int first; + + guard(mutex)(&data->msi_map_lock); + first = bitmap_find_free_region(data->msi_map, data->num_irqs, + get_count_order(num_req)); + return first >= 0 ? first : -ENOSPC; +} + +static void sg2042_msi_free_hwirq(struct sg2042_msi_chipdata *data, int hwirq, int num_req) +{ + guard(mutex)(&data->msi_map_lock); + bitmap_release_region(data->msi_map, hwirq, get_count_order(num_req)); +} + +static void sg2042_msi_irq_ack(struct irq_data *d) +{ + struct sg2042_msi_chipdata *data = irq_data_get_irq_chip_data(d); + int bit_off = d->hwirq; + + writel(1 << bit_off, data->reg_clr); + + irq_chip_ack_parent(d); +} + +static void sg2042_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct sg2042_msi_chipdata *data = irq_data_get_irq_chip_data(d); + + msg->address_hi = upper_32_bits(data->doorbell_addr); + msg->address_lo = lower_32_bits(data->doorbell_addr); + msg->data = 1 << d->hwirq; +} + +static const struct irq_chip sg2042_msi_middle_irq_chip = { + .name = "SG2042 MSI", + .irq_ack = sg2042_msi_irq_ack, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, +#ifdef CONFIG_SMP + .irq_set_affinity = irq_chip_set_affinity_parent, +#endif + .irq_compose_msi_msg = sg2042_msi_irq_compose_msi_msg, +}; + +static int sg2042_msi_parent_domain_alloc(struct irq_domain *domain, unsigned int virq, int hwirq) +{ + struct sg2042_msi_chipdata *data = domain->host_data; + struct irq_fwspec fwspec; + struct irq_data *d; + int ret; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 2; + fwspec.param[0] = data->irq_first + hwirq; + fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) + return ret; + + d = irq_domain_get_irq_data(domain->parent, virq); + return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); +} + +static int sg2042_msi_middle_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct sg2042_msi_chipdata *data = domain->host_data; + int hwirq, err, i; + + hwirq = sg2042_msi_allocate_hwirq(data, nr_irqs); + if (hwirq < 0) + return hwirq; + + for (i = 0; i < nr_irqs; i++) { + err = sg2042_msi_parent_domain_alloc(domain, virq + i, hwirq + i); + if (err) + goto err_hwirq; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &sg2042_msi_middle_irq_chip, data); + } + + return 0; + +err_hwirq: + sg2042_msi_free_hwirq(data, hwirq, nr_irqs); + irq_domain_free_irqs_parent(domain, virq, i); + + return err; +} + +static void sg2042_msi_middle_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct sg2042_msi_chipdata *data = irq_data_get_irq_chip_data(d); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + sg2042_msi_free_hwirq(data, d->hwirq, nr_irqs); +} + +static const struct irq_domain_ops sg2042_msi_middle_domain_ops = { + .alloc = sg2042_msi_middle_domain_alloc, + .free = sg2042_msi_middle_domain_free, + .select = msi_lib_irq_domain_select, +}; + +#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS) + +#define SG2042_MSI_FLAGS_SUPPORTED MSI_GENERIC_FLAGS_MASK + +static const struct msi_parent_ops sg2042_msi_parent_ops = { + .required_flags = SG2042_MSI_FLAGS_REQUIRED, + .supported_flags = SG2042_MSI_FLAGS_SUPPORTED, + .bus_select_mask = MATCH_PCI_MSI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .prefix = "SG2042-", + .init_dev_msi_info = msi_lib_init_dev_msi_info, +}; + +static int sg2042_msi_init_domains(struct sg2042_msi_chipdata *data, + struct irq_domain *plic_domain, struct device *dev) +{ + struct fwnode_handle *fwnode = dev_fwnode(dev); + struct irq_domain *middle_domain; + + middle_domain = irq_domain_create_hierarchy(plic_domain, 0, data->num_irqs, fwnode, + &sg2042_msi_middle_domain_ops, data); + if (!middle_domain) { + pr_err("Failed to create the MSI middle domain\n"); + return -ENOMEM; + } + + irq_domain_update_bus_token(middle_domain, DOMAIN_BUS_NEXUS); + + middle_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + middle_domain->msi_parent_ops = &sg2042_msi_parent_ops; + + return 0; +} + +static int sg2042_msi_probe(struct platform_device *pdev) +{ + struct fwnode_reference_args args = { }; + struct sg2042_msi_chipdata *data; + struct device *dev = &pdev->dev; + struct irq_domain *plic_domain; + struct resource *res; + int ret; + + data = devm_kzalloc(dev, sizeof(struct sg2042_msi_chipdata), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->reg_clr = devm_platform_ioremap_resource_byname(pdev, "clr"); + if (IS_ERR(data->reg_clr)) { + dev_err(dev, "Failed to map clear register\n"); + return PTR_ERR(data->reg_clr); + } + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "doorbell"); + if (!res) { + dev_err(dev, "Failed get resource from set\n"); + return -EINVAL; + } + data->doorbell_addr = res->start; + + ret = fwnode_property_get_reference_args(dev_fwnode(dev), "msi-ranges", + "#interrupt-cells", 0, 0, &args); + if (ret) { + dev_err(dev, "Unable to parse MSI vec base\n"); + return ret; + } + fwnode_handle_put(args.fwnode); + + ret = fwnode_property_get_reference_args(dev_fwnode(dev), "msi-ranges", NULL, + args.nargs + 1, 0, &args); + if (ret) { + dev_err(dev, "Unable to parse MSI vec number\n"); + return ret; + } + + plic_domain = irq_find_matching_fwnode(args.fwnode, DOMAIN_BUS_ANY); + fwnode_handle_put(args.fwnode); + if (!plic_domain) { + pr_err("Failed to find the PLIC domain\n"); + return -ENXIO; + } + + data->irq_first = (u32)args.args[0]; + data->num_irqs = (u32)args.args[args.nargs - 1]; + + mutex_init(&data->msi_map_lock); + + return sg2042_msi_init_domains(data, plic_domain, dev); +} + +static const struct of_device_id sg2042_msi_of_match[] = { + { .compatible = "sophgo,sg2042-msi" }, + { } +}; + +static struct platform_driver sg2042_msi_driver = { + .driver = { + .name = "sg2042-msi", + .of_match_table = sg2042_msi_of_match, + }, + .probe = sg2042_msi_probe, +}; +builtin_platform_driver(sg2042_msi_driver); diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c index 0b43121520243..01b0d83217285 100644 --- a/drivers/irqchip/irq-sunxi-nmi.c +++ b/drivers/irqchip/irq-sunxi-nmi.c @@ -48,32 +48,41 @@ enum { SUNXI_SRC_TYPE_EDGE_RISING, }; -struct sunxi_sc_nmi_reg_offs { - u32 ctrl; - u32 pend; - u32 enable; +struct sunxi_sc_nmi_data { + struct { + u32 ctrl; + u32 pend; + u32 enable; + } reg_offs; + u32 enable_val; }; -static const struct sunxi_sc_nmi_reg_offs sun6i_reg_offs __initconst = { - .ctrl = SUN6I_NMI_CTRL, - .pend = SUN6I_NMI_PENDING, - .enable = SUN6I_NMI_ENABLE, +static const struct sunxi_sc_nmi_data sun6i_data __initconst = { + .reg_offs.ctrl = SUN6I_NMI_CTRL, + .reg_offs.pend = SUN6I_NMI_PENDING, + .reg_offs.enable = SUN6I_NMI_ENABLE, }; -static const struct sunxi_sc_nmi_reg_offs sun7i_reg_offs __initconst = { - .ctrl = SUN7I_NMI_CTRL, - .pend = SUN7I_NMI_PENDING, - .enable = SUN7I_NMI_ENABLE, +static const struct sunxi_sc_nmi_data sun7i_data __initconst = { + .reg_offs.ctrl = SUN7I_NMI_CTRL, + .reg_offs.pend = SUN7I_NMI_PENDING, + .reg_offs.enable = SUN7I_NMI_ENABLE, }; -static const struct sunxi_sc_nmi_reg_offs sun9i_reg_offs __initconst = { - .ctrl = SUN9I_NMI_CTRL, - .pend = SUN9I_NMI_PENDING, - .enable = SUN9I_NMI_ENABLE, +static const struct sunxi_sc_nmi_data sun9i_data __initconst = { + .reg_offs.ctrl = SUN9I_NMI_CTRL, + .reg_offs.pend = SUN9I_NMI_PENDING, + .reg_offs.enable = SUN9I_NMI_ENABLE, }; -static inline void sunxi_sc_nmi_write(struct irq_chip_generic *gc, u32 off, - u32 val) +static const struct sunxi_sc_nmi_data sun55i_a523_data __initconst = { + .reg_offs.ctrl = SUN9I_NMI_CTRL, + .reg_offs.pend = SUN9I_NMI_PENDING, + .reg_offs.enable = SUN9I_NMI_ENABLE, + .enable_val = BIT(31), +}; + +static inline void sunxi_sc_nmi_write(struct irq_chip_generic *gc, u32 off, u32 val) { irq_reg_writel(gc, val, off); } @@ -143,15 +152,13 @@ static int sunxi_sc_nmi_set_type(struct irq_data *data, unsigned int flow_type) } static int __init sunxi_sc_nmi_irq_init(struct device_node *node, - const struct sunxi_sc_nmi_reg_offs *reg_offs) + const struct sunxi_sc_nmi_data *data) { - struct irq_domain *domain; + unsigned int irq, clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; struct irq_chip_generic *gc; - unsigned int irq; - unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; + struct irq_domain *domain; int ret; - domain = irq_domain_add_linear(node, 1, &irq_generic_chip_ops, NULL); if (!domain) { pr_err("Could not register interrupt domain.\n"); @@ -186,27 +193,28 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, gc->chip_types[0].chip.irq_unmask = irq_gc_mask_set_bit; gc->chip_types[0].chip.irq_eoi = irq_gc_ack_set_bit; gc->chip_types[0].chip.irq_set_type = sunxi_sc_nmi_set_type; - gc->chip_types[0].chip.flags = IRQCHIP_EOI_THREADED | IRQCHIP_EOI_IF_HANDLED | + gc->chip_types[0].chip.flags = IRQCHIP_EOI_THREADED | + IRQCHIP_EOI_IF_HANDLED | IRQCHIP_SKIP_SET_WAKE; - gc->chip_types[0].regs.ack = reg_offs->pend; - gc->chip_types[0].regs.mask = reg_offs->enable; - gc->chip_types[0].regs.type = reg_offs->ctrl; + gc->chip_types[0].regs.ack = data->reg_offs.pend; + gc->chip_types[0].regs.mask = data->reg_offs.enable; + gc->chip_types[0].regs.type = data->reg_offs.ctrl; gc->chip_types[1].type = IRQ_TYPE_EDGE_BOTH; gc->chip_types[1].chip.irq_ack = irq_gc_ack_set_bit; gc->chip_types[1].chip.irq_mask = irq_gc_mask_clr_bit; gc->chip_types[1].chip.irq_unmask = irq_gc_mask_set_bit; gc->chip_types[1].chip.irq_set_type = sunxi_sc_nmi_set_type; - gc->chip_types[1].regs.ack = reg_offs->pend; - gc->chip_types[1].regs.mask = reg_offs->enable; - gc->chip_types[1].regs.type = reg_offs->ctrl; + gc->chip_types[1].regs.ack = data->reg_offs.pend; + gc->chip_types[1].regs.mask = data->reg_offs.enable; + gc->chip_types[1].regs.type = data->reg_offs.ctrl; gc->chip_types[1].handler = handle_edge_irq; /* Disable any active interrupts */ - sunxi_sc_nmi_write(gc, reg_offs->enable, 0); + sunxi_sc_nmi_write(gc, data->reg_offs.enable, data->enable_val); /* Clear any pending NMI interrupts */ - sunxi_sc_nmi_write(gc, reg_offs->pend, SUNXI_NMI_IRQ_BIT); + sunxi_sc_nmi_write(gc, data->reg_offs.pend, SUNXI_NMI_IRQ_BIT); irq_set_chained_handler_and_data(irq, sunxi_sc_nmi_handle_irq, domain); @@ -221,20 +229,27 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, static int __init sun6i_sc_nmi_irq_init(struct device_node *node, struct device_node *parent) { - return sunxi_sc_nmi_irq_init(node, &sun6i_reg_offs); + return sunxi_sc_nmi_irq_init(node, &sun6i_data); } IRQCHIP_DECLARE(sun6i_sc_nmi, "allwinner,sun6i-a31-sc-nmi", sun6i_sc_nmi_irq_init); static int __init sun7i_sc_nmi_irq_init(struct device_node *node, struct device_node *parent) { - return sunxi_sc_nmi_irq_init(node, &sun7i_reg_offs); + return sunxi_sc_nmi_irq_init(node, &sun7i_data); } IRQCHIP_DECLARE(sun7i_sc_nmi, "allwinner,sun7i-a20-sc-nmi", sun7i_sc_nmi_irq_init); static int __init sun9i_nmi_irq_init(struct device_node *node, struct device_node *parent) { - return sunxi_sc_nmi_irq_init(node, &sun9i_reg_offs); + return sunxi_sc_nmi_irq_init(node, &sun9i_data); } IRQCHIP_DECLARE(sun9i_nmi, "allwinner,sun9i-a80-nmi", sun9i_nmi_irq_init); + +static int __init sun55i_nmi_irq_init(struct device_node *node, + struct device_node *parent) +{ + return sunxi_sc_nmi_irq_init(node, &sun55i_a523_data); +} +IRQCHIP_DECLARE(sun55i_nmi, "allwinner,sun55i-a523-nmi", sun55i_nmi_irq_init); diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c index 74b2f124116e3..52d77546aacb9 100644 --- a/drivers/irqchip/qcom-pdc.c +++ b/drivers/irqchip/qcom-pdc.c @@ -21,9 +21,11 @@ #include #define PDC_MAX_GPIO_IRQS 256 +#define PDC_DRV_OFFSET 0x10000 /* Valid only on HW version < 3.2 */ #define IRQ_ENABLE_BANK 0x10 +#define IRQ_ENABLE_BANK_MAX (IRQ_ENABLE_BANK + BITS_TO_BYTES(PDC_MAX_GPIO_IRQS)) #define IRQ_i_CFG 0x110 /* Valid only on HW version >= 3.2 */ @@ -46,13 +48,20 @@ struct pdc_pin_region { static DEFINE_RAW_SPINLOCK(pdc_lock); static void __iomem *pdc_base; +static void __iomem *pdc_prev_base; static struct pdc_pin_region *pdc_region; static int pdc_region_cnt; static unsigned int pdc_version; +static bool pdc_x1e_quirk; + +static void pdc_base_reg_write(void __iomem *base, int reg, u32 i, u32 val) +{ + writel_relaxed(val, base + reg + i * sizeof(u32)); +} static void pdc_reg_write(int reg, u32 i, u32 val) { - writel_relaxed(val, pdc_base + reg + i * sizeof(u32)); + pdc_base_reg_write(pdc_base, reg, i, val); } static u32 pdc_reg_read(int reg, u32 i) @@ -60,6 +69,34 @@ static u32 pdc_reg_read(int reg, u32 i) return readl_relaxed(pdc_base + reg + i * sizeof(u32)); } +static void pdc_x1e_irq_enable_write(u32 bank, u32 enable) +{ + void __iomem *base; + + /* Remap the write access to work around a hardware bug on X1E */ + switch (bank) { + case 0 ... 1: + /* Use previous DRV (client) region and shift to bank 3-4 */ + base = pdc_prev_base; + bank += 3; + break; + case 2 ... 4: + /* Use our own region and shift to bank 0-2 */ + base = pdc_base; + bank -= 2; + break; + case 5: + /* No fixup required for bank 5 */ + base = pdc_base; + break; + default: + WARN_ON(1); + return; + } + + pdc_base_reg_write(base, IRQ_ENABLE_BANK, bank, enable); +} + static void __pdc_enable_intr(int pin_out, bool on) { unsigned long enable; @@ -72,7 +109,11 @@ static void __pdc_enable_intr(int pin_out, bool on) enable = pdc_reg_read(IRQ_ENABLE_BANK, index); __assign_bit(mask, &enable, on); - pdc_reg_write(IRQ_ENABLE_BANK, index, enable); + + if (pdc_x1e_quirk) + pdc_x1e_irq_enable_write(index, enable); + else + pdc_reg_write(IRQ_ENABLE_BANK, index, enable); } else { enable = pdc_reg_read(IRQ_i_CFG, pin_out); __assign_bit(IRQ_i_CFG_IRQ_ENABLE, &enable, on); @@ -324,10 +365,29 @@ static int qcom_pdc_init(struct device_node *node, struct device_node *parent) if (res_size > resource_size(&res)) pr_warn("%pOF: invalid reg size, please fix DT\n", node); + /* + * PDC has multiple DRV regions, each one provides the same set of + * registers for a particular client in the system. Due to a hardware + * bug on X1E, some writes to the IRQ_ENABLE_BANK register must be + * issued inside the previous region. This region belongs to + * a different client and is not described in the device tree. Map the + * region with the expected offset to preserve support for old DTs. + */ + if (of_device_is_compatible(node, "qcom,x1e80100-pdc")) { + pdc_prev_base = ioremap(res.start - PDC_DRV_OFFSET, IRQ_ENABLE_BANK_MAX); + if (!pdc_prev_base) { + pr_err("%pOF: unable to map previous PDC DRV region\n", node); + return -ENXIO; + } + + pdc_x1e_quirk = true; + } + pdc_base = ioremap(res.start, res_size); if (!pdc_base) { pr_err("%pOF: unable to map PDC registers\n", node); - return -ENXIO; + ret = -ENXIO; + goto fail; } pdc_version = pdc_reg_read(PDC_VERSION_REG, 0); @@ -363,6 +423,7 @@ static int qcom_pdc_init(struct device_node *node, struct device_node *parent) fail: kfree(pdc_region); iounmap(pdc_base); + iounmap(pdc_prev_base); return ret; } diff --git a/drivers/leds/trigger/ledtrig-pattern.c b/drivers/leds/trigger/ledtrig-pattern.c index aad48c2540fc8..a594bd5e22333 100644 --- a/drivers/leds/trigger/ledtrig-pattern.c +++ b/drivers/leds/trigger/ledtrig-pattern.c @@ -483,8 +483,8 @@ static int pattern_trig_activate(struct led_classdev *led_cdev) data->led_cdev = led_cdev; led_set_trigger_data(led_cdev, data); timer_setup(&data->timer, pattern_trig_timer_function, 0); - hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->hrtimer.function = pattern_trig_hrtimer_function; + hrtimer_setup(&data->hrtimer, pattern_trig_hrtimer_function, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); led_cdev->activated = true; if (led_cdev->flags & LED_INIT_DEFAULT_TRIGGER) { diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index d3d26a2c98956..118beaf447aa9 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -534,9 +534,7 @@ int mbox_controller_register(struct mbox_controller *mbox) return -EINVAL; } - hrtimer_init(&mbox->poll_hrt, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - mbox->poll_hrt.function = txdone_hrtimer; + hrtimer_setup(&mbox->poll_hrt, txdone_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); spin_lock_init(&mbox->poll_hrt_lock); } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e0..c45464b6576aa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3790,20 +3790,18 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: { - __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; - - watermark_percentage += ic->journal_entries / 2; - do_div(watermark_percentage, ic->journal_entries); - arg_count = 3; + arg_count = 1; /* buffer_sectors */ arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += ic->reset_recalculate_flag; arg_count += ic->discard; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'B'; - arg_count += ic->mode == 'B'; + arg_count += ic->mode != 'I'; /* interleave_sectors */ + arg_count += ic->mode == 'J'; /* journal_sectors */ + arg_count += ic->mode == 'J'; /* journal_watermark */ + arg_count += ic->mode == 'J'; /* commit_time */ + arg_count += ic->mode == 'B'; /* sectors_per_bit */ + arg_count += ic->mode == 'B'; /* bitmap_flush_interval */ arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -3822,10 +3820,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" reset_recalculate"); if (ic->discard) DMEMIT(" allow_discards"); - DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); - DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + if (ic->mode != 'I') + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); if (ic->mode == 'J') { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); } diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index b6f8e2dc7729f..3f3d29af1be47 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -2178,6 +2178,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones) vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval); vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval); + spin_lock_init(&zones->lock); /* * Since we will save up the timeouts that would have been reported but were ratelimited, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 8fc9339b00c72..70bcc3cdf2cd1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -386,10 +386,8 @@ static int raid0_set_limits(struct mddev *mddev) lim.io_opt = lim.io_min * mddev->raid_disks; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9d57a88dbd261..10ea3af40991d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3219,10 +3219,8 @@ static int raid1_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index efe93b9791677..15b9ae5bf84d8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4020,10 +4020,8 @@ static int raid10_set_queue_limits(struct mddev *mddev) lim.io_opt = lim.io_min * raid10_nr_stripes(conf); lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/media/cec/core/cec-pin.c b/drivers/media/cec/core/cec-pin.c index a70451d99ebc9..bebaa40e0eb51 100644 --- a/drivers/media/cec/core/cec-pin.c +++ b/drivers/media/cec/core/cec-pin.c @@ -1346,9 +1346,8 @@ struct cec_adapter *cec_pin_allocate_adapter(const struct cec_pin_ops *pin_ops, if (pin == NULL) return ERR_PTR(-ENOMEM); pin->ops = pin_ops; - hrtimer_init(&pin->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_set(&pin->work_pin_num_events, 0); - pin->timer.function = cec_pin_timer; + hrtimer_setup(&pin->timer, cec_pin_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); init_waitqueue_head(&pin->kthread_waitq); pin->tx_custom_low_usecs = CEC_TIM_CUSTOM_DEFAULT; pin->tx_custom_high_usecs = CEC_TIM_CUSTOM_DEFAULT; diff --git a/drivers/media/pci/cx88/cx88-input.c b/drivers/media/pci/cx88/cx88-input.c index a04a1d33fadb1..b9f2c14d62b40 100644 --- a/drivers/media/pci/cx88/cx88-input.c +++ b/drivers/media/pci/cx88/cx88-input.c @@ -190,8 +190,7 @@ static int __cx88_ir_start(void *priv) ir = core->ir; if (ir->polling) { - hrtimer_init(&ir->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ir->timer.function = cx88_ir_work; + hrtimer_setup(&ir->timer, cx88_ir_work, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_start(&ir->timer, ktime_set(0, ir->polling * 1000000), HRTIMER_MODE_REL); diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c b/drivers/media/platform/chips-media/wave5/wave5-vpu.c index d1320298a0f76..8479dc9c9a8fb 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c @@ -269,8 +269,8 @@ static int wave5_vpu_probe(struct platform_device *pdev) dev->irq = platform_get_irq(pdev, 0); if (dev->irq < 0) { dev_err(&pdev->dev, "failed to get irq resource, falling back to polling\n"); - hrtimer_init(&dev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - dev->hrtimer.function = &wave5_vpu_timer_callback; + hrtimer_setup(&dev->hrtimer, &wave5_vpu_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); dev->worker = kthread_run_worker(0, "vpu_irq_thread"); if (IS_ERR(dev->worker)) { dev_err(&pdev->dev, "failed to create vpu irq worker\n"); diff --git a/drivers/media/rc/pwm-ir-tx.c b/drivers/media/rc/pwm-ir-tx.c index fe368aebbc139..84533fdd61aa6 100644 --- a/drivers/media/rc/pwm-ir-tx.c +++ b/drivers/media/rc/pwm-ir-tx.c @@ -172,8 +172,7 @@ static int pwm_ir_probe(struct platform_device *pdev) rcdev->tx_ir = pwm_ir_tx_sleep; } else { init_completion(&pwm_ir->tx_done); - hrtimer_init(&pwm_ir->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pwm_ir->timer.function = pwm_ir_timer; + hrtimer_setup(&pwm_ir->timer, pwm_ir_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rcdev->tx_ir = pwm_ir_tx_atomic; } diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c index e0174da5e9fc3..77b0490a1b38d 100644 --- a/drivers/misc/cardreader/rtsx_usb.c +++ b/drivers/misc/cardreader/rtsx_usb.c @@ -286,7 +286,6 @@ static int rtsx_usb_get_status_with_bulk(struct rtsx_ucr *ucr, u16 *status) int rtsx_usb_get_card_status(struct rtsx_ucr *ucr, u16 *status) { int ret; - u8 interrupt_val = 0; u16 *buf; if (!status) @@ -309,20 +308,6 @@ int rtsx_usb_get_card_status(struct rtsx_ucr *ucr, u16 *status) ret = rtsx_usb_get_status_with_bulk(ucr, status); } - rtsx_usb_read_register(ucr, CARD_INT_PEND, &interrupt_val); - /* Cross check presence with interrupts */ - if (*status & XD_CD) - if (!(interrupt_val & XD_INT)) - *status &= ~XD_CD; - - if (*status & SD_CD) - if (!(interrupt_val & SD_INT)) - *status &= ~SD_CD; - - if (*status & MS_CD) - if (!(interrupt_val & MS_INT)) - *status &= ~MS_CD; - /* usb_control_msg may return positive when success */ if (ret < 0) return ret; diff --git a/drivers/misc/eeprom/digsy_mtc_eeprom.c b/drivers/misc/eeprom/digsy_mtc_eeprom.c index 88888485e6f8e..ee58f7ce5bfa9 100644 --- a/drivers/misc/eeprom/digsy_mtc_eeprom.c +++ b/drivers/misc/eeprom/digsy_mtc_eeprom.c @@ -50,7 +50,7 @@ static struct platform_device digsy_mtc_eeprom = { }; static struct gpiod_lookup_table eeprom_spi_gpiod_table = { - .dev_id = "spi_gpio", + .dev_id = "spi_gpio.1", .table = { GPIO_LOOKUP("gpio@b00", GPIO_EEPROM_CLK, "sck", GPIO_ACTIVE_HIGH), diff --git a/drivers/misc/mei/Kconfig b/drivers/misc/mei/Kconfig index 67d9391f18550..7575fee96cc6a 100644 --- a/drivers/misc/mei/Kconfig +++ b/drivers/misc/mei/Kconfig @@ -3,7 +3,7 @@ config INTEL_MEI tristate "Intel Management Engine Interface" depends on X86 && PCI - default GENERIC_CPU || MCORE2 || MATOM || X86_GENERIC + default X86_64 || MATOM help The Intel Management Engine (Intel ME) provides Manageability, Security and Media services for system containing Intel chipsets. diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index c3a6657dcd4a2..a5f88ec97df75 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -117,6 +117,8 @@ #define MEI_DEV_ID_LNL_M 0xA870 /* Lunar Lake Point M */ +#define MEI_DEV_ID_PTL_P 0xE470 /* Panther Lake P */ + /* * MEI HW Section */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 6589635f8ba32..d6ff9d82ae94b 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -124,6 +124,8 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_LNL_M, MEI_ME_PCH15_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_PTL_P, MEI_ME_PCH15_CFG)}, + /* required last entry */ {0, } }; diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index 35d349fee7698..7be1649b19725 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -502,7 +502,7 @@ static int vsc_tp_probe(struct spi_device *spi) if (ret) return ret; - tp->wakeuphost = devm_gpiod_get(dev, "wakeuphost", GPIOD_IN); + tp->wakeuphost = devm_gpiod_get(dev, "wakeuphostint", GPIOD_IN); if (IS_ERR(tp->wakeuphost)) return PTR_ERR(tp->wakeuphost); diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 055395cde42b6..999026a1ae048 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -873,6 +873,7 @@ static int setup_wait(struct ntsync_device *dev, { int fds[NTSYNC_MAX_WAIT_COUNT + 1]; const __u32 count = args->count; + size_t size = array_size(count, sizeof(fds[0])); struct ntsync_q *q; __u32 total_count; __u32 i, j; @@ -880,15 +881,14 @@ static int setup_wait(struct ntsync_device *dev, if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) return -EINVAL; - if (args->count > NTSYNC_MAX_WAIT_COUNT) + if (size >= sizeof(fds)) return -EINVAL; total_count = count; if (args->alert) total_count++; - if (copy_from_user(fds, u64_to_user_ptr(args->objs), - array_size(count, sizeof(*fds)))) + if (copy_from_user(fds, u64_to_user_ptr(args->objs), size)) return -EFAULT; if (args->alert) fds[count] = args->alert; @@ -1208,6 +1208,7 @@ static struct miscdevice ntsync_misc = { .minor = MISC_DYNAMIC_MINOR, .name = NTSYNC_NAME, .fops = &ntsync_fops, + .mode = 0666, }; module_misc_device(ntsync_misc); diff --git a/drivers/misc/vcpu_stall_detector.c b/drivers/misc/vcpu_stall_detector.c index f0b1fc87490ec..26166357b255e 100644 --- a/drivers/misc/vcpu_stall_detector.c +++ b/drivers/misc/vcpu_stall_detector.c @@ -111,8 +111,7 @@ static int start_stall_detector_cpu(unsigned int cpu) ping_timeout_ms = vcpu_stall_config.stall_timeout_sec * MSEC_PER_SEC / 2; - hrtimer_init(vcpu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vcpu_hrtimer->function = vcpu_stall_detect_timer_fn; + hrtimer_setup(vcpu_hrtimer, vcpu_stall_detect_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu_stall_detector->is_initialized = true; hrtimer_start(vcpu_hrtimer, ms_to_ktime(ping_timeout_ms), diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 3cbda98d08d28..31f40c04afda5 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1875,8 +1875,7 @@ static void dw_mci_init_fault(struct dw_mci *host) { host->fail_data_crc = (struct fault_attr) FAULT_ATTR_INITIALIZER; - hrtimer_init(&host->fault_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - host->fault_timer.function = dw_mci_fault_timer; + hrtimer_setup(&host->fault_timer, dw_mci_fault_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } #else static void dw_mci_init_fault(struct dw_mci *host) diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 8d1d710e439dd..6667eea955977 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -471,6 +471,8 @@ struct cdns_nand_ctrl { struct { void __iomem *virt; dma_addr_t dma; + dma_addr_t iova_dma; + u32 size; } io; int irq; @@ -1835,11 +1837,11 @@ static int cadence_nand_slave_dma_transfer(struct cdns_nand_ctrl *cdns_ctrl, } if (dir == DMA_FROM_DEVICE) { - src_dma = cdns_ctrl->io.dma; + src_dma = cdns_ctrl->io.iova_dma; dst_dma = buf_dma; } else { src_dma = buf_dma; - dst_dma = cdns_ctrl->io.dma; + dst_dma = cdns_ctrl->io.iova_dma; } tx = dmaengine_prep_dma_memcpy(cdns_ctrl->dmac, dst_dma, src_dma, len, @@ -1861,12 +1863,12 @@ static int cadence_nand_slave_dma_transfer(struct cdns_nand_ctrl *cdns_ctrl, dma_async_issue_pending(cdns_ctrl->dmac); wait_for_completion(&finished); - dma_unmap_single(cdns_ctrl->dev, buf_dma, len, dir); + dma_unmap_single(dma_dev->dev, buf_dma, len, dir); return 0; err_unmap: - dma_unmap_single(cdns_ctrl->dev, buf_dma, len, dir); + dma_unmap_single(dma_dev->dev, buf_dma, len, dir); err: dev_dbg(cdns_ctrl->dev, "Fall back to CPU I/O\n"); @@ -2869,6 +2871,7 @@ cadence_nand_irq_cleanup(int irqnum, struct cdns_nand_ctrl *cdns_ctrl) static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) { dma_cap_mask_t mask; + struct dma_device *dma_dev = cdns_ctrl->dmac->device; int ret; cdns_ctrl->cdma_desc = dma_alloc_coherent(cdns_ctrl->dev, @@ -2904,15 +2907,24 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) dma_cap_set(DMA_MEMCPY, mask); if (cdns_ctrl->caps1->has_dma) { - cdns_ctrl->dmac = dma_request_channel(mask, NULL, NULL); - if (!cdns_ctrl->dmac) { - dev_err(cdns_ctrl->dev, - "Unable to get a DMA channel\n"); - ret = -EBUSY; + cdns_ctrl->dmac = dma_request_chan_by_mask(&mask); + if (IS_ERR(cdns_ctrl->dmac)) { + ret = dev_err_probe(cdns_ctrl->dev, PTR_ERR(cdns_ctrl->dmac), + "%d: Failed to get a DMA channel\n", ret); goto disable_irq; } } + cdns_ctrl->io.iova_dma = dma_map_resource(dma_dev->dev, cdns_ctrl->io.dma, + cdns_ctrl->io.size, + DMA_BIDIRECTIONAL, 0); + + ret = dma_mapping_error(dma_dev->dev, cdns_ctrl->io.iova_dma); + if (ret) { + dev_err(cdns_ctrl->dev, "Failed to map I/O resource to DMA\n"); + goto dma_release_chnl; + } + nand_controller_init(&cdns_ctrl->controller); INIT_LIST_HEAD(&cdns_ctrl->chips); @@ -2923,18 +2935,22 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) if (ret) { dev_err(cdns_ctrl->dev, "Failed to register MTD: %d\n", ret); - goto dma_release_chnl; + goto unmap_dma_resource; } kfree(cdns_ctrl->buf); cdns_ctrl->buf = kzalloc(cdns_ctrl->buf_size, GFP_KERNEL); if (!cdns_ctrl->buf) { ret = -ENOMEM; - goto dma_release_chnl; + goto unmap_dma_resource; } return 0; +unmap_dma_resource: + dma_unmap_resource(dma_dev->dev, cdns_ctrl->io.iova_dma, + cdns_ctrl->io.size, DMA_BIDIRECTIONAL, 0); + dma_release_chnl: if (cdns_ctrl->dmac) dma_release_channel(cdns_ctrl->dmac); @@ -2956,6 +2972,10 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) static void cadence_nand_remove(struct cdns_nand_ctrl *cdns_ctrl) { cadence_nand_chips_cleanup(cdns_ctrl); + if (cdns_ctrl->dmac) + dma_unmap_resource(cdns_ctrl->dmac->device->dev, + cdns_ctrl->io.iova_dma, cdns_ctrl->io.size, + DMA_BIDIRECTIONAL, 0); cadence_nand_irq_cleanup(cdns_ctrl->irq, cdns_ctrl); kfree(cdns_ctrl->buf); dma_free_coherent(cdns_ctrl->dev, sizeof(struct cadence_nand_cdma_desc), @@ -3020,7 +3040,9 @@ static int cadence_nand_dt_probe(struct platform_device *ofdev) cdns_ctrl->io.virt = devm_platform_get_and_ioremap_resource(ofdev, 1, &res); if (IS_ERR(cdns_ctrl->io.virt)) return PTR_ERR(cdns_ctrl->io.virt); + cdns_ctrl->io.dma = res->start; + cdns_ctrl->io.size = resource_size(res); dt->clk = devm_clk_get(cdns_ctrl->dev, "nf_clk"); if (IS_ERR(dt->clk)) diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index d2d2aeee42a7e..6720b547892ba 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -1881,18 +1881,18 @@ static int qcom_param_page_type_exec(struct nand_chip *chip, const struct nand_ nandc->regs->addr0 = 0; nandc->regs->addr1 = 0; - host->cfg0 = FIELD_PREP(CW_PER_PAGE_MASK, 0) | - FIELD_PREP(UD_SIZE_BYTES_MASK, 512) | - FIELD_PREP(NUM_ADDR_CYCLES_MASK, 5) | - FIELD_PREP(SPARE_SIZE_BYTES_MASK, 0); - - host->cfg1 = FIELD_PREP(NAND_RECOVERY_CYCLES_MASK, 7) | - FIELD_PREP(BAD_BLOCK_BYTE_NUM_MASK, 17) | - FIELD_PREP(CS_ACTIVE_BSY, 0) | - FIELD_PREP(BAD_BLOCK_IN_SPARE_AREA, 1) | - FIELD_PREP(WR_RD_BSY_GAP_MASK, 2) | - FIELD_PREP(WIDE_FLASH, 0) | - FIELD_PREP(DEV0_CFG1_ECC_DISABLE, 1); + nandc->regs->cfg0 = cpu_to_le32(FIELD_PREP(CW_PER_PAGE_MASK, 0) | + FIELD_PREP(UD_SIZE_BYTES_MASK, 512) | + FIELD_PREP(NUM_ADDR_CYCLES_MASK, 5) | + FIELD_PREP(SPARE_SIZE_BYTES_MASK, 0)); + + nandc->regs->cfg1 = cpu_to_le32(FIELD_PREP(NAND_RECOVERY_CYCLES_MASK, 7) | + FIELD_PREP(BAD_BLOCK_BYTE_NUM_MASK, 17) | + FIELD_PREP(CS_ACTIVE_BSY, 0) | + FIELD_PREP(BAD_BLOCK_IN_SPARE_AREA, 1) | + FIELD_PREP(WR_RD_BSY_GAP_MASK, 2) | + FIELD_PREP(WIDE_FLASH, 0) | + FIELD_PREP(DEV0_CFG1_ECC_DISABLE, 1)); if (!nandc->props->qpic_version2) nandc->regs->ecc_buf_cfg = cpu_to_le32(ECC_CFG_ECC_DISABLE); diff --git a/drivers/mtd/spi-nor/sst.c b/drivers/mtd/spi-nor/sst.c index b5ad7118c49a2..175211fe6a5ed 100644 --- a/drivers/mtd/spi-nor/sst.c +++ b/drivers/mtd/spi-nor/sst.c @@ -174,7 +174,7 @@ static int sst_nor_write_data(struct spi_nor *nor, loff_t to, size_t len, int ret; nor->program_opcode = op; - ret = spi_nor_write_data(nor, to, 1, buf); + ret = spi_nor_write_data(nor, to, len, buf); if (ret < 0) return ret; WARN(ret != len, "While writing %zu byte written %i bytes\n", len, ret); diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index 7fea00c7ca8a6..c60386bf2d1a4 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -745,7 +745,7 @@ static int cfv_probe(struct virtio_device *vdev) if (cfv->vr_rx) vdev->vringh_config->del_vrhs(cfv->vdev); - if (cfv->vdev) + if (cfv->vq_tx) vdev->config->del_vqs(cfv->vdev); free_netdev(netdev); return err; diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index d025d4163fd12..884a6352c42b7 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2420,12 +2420,11 @@ int m_can_class_register(struct m_can_classdev *cdev) if (!cdev->net->irq) { dev_dbg(cdev->dev, "Polling enabled, initialize hrtimer"); - hrtimer_init(&cdev->hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - cdev->hrtimer.function = &hrtimer_callback; + hrtimer_setup(&cdev->hrtimer, &hrtimer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); } else { - hrtimer_init(&cdev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cdev->hrtimer.function = m_can_coalescing_timer; + hrtimer_setup(&cdev->hrtimer, m_can_coalescing_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ret = m_can_dev_setup(cdev); diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c index 7209a831f0f20..c34f2067a989c 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c @@ -541,11 +541,11 @@ int mcp251xfd_ring_alloc(struct mcp251xfd_priv *priv) } priv->rx_ring_num = i; - hrtimer_init(&priv->rx_irq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - priv->rx_irq_timer.function = mcp251xfd_rx_irq_timer; + hrtimer_setup(&priv->rx_irq_timer, mcp251xfd_rx_irq_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&priv->tx_irq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - priv->tx_irq_timer.function = mcp251xfd_tx_irq_timer; + hrtimer_setup(&priv->tx_irq_timer, mcp251xfd_tx_irq_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); return 0; } diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 1c83af805209c..5883eb93efb11 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2591,7 +2591,8 @@ mt7531_setup_common(struct dsa_switch *ds) if (ret < 0) return ret; - return 0; + /* Setup VLAN ID 0 for VLAN-unaware bridges */ + return mt7530_setup_vlan0(priv); } static int @@ -2687,11 +2688,6 @@ mt7531_setup(struct dsa_switch *ds) if (ret) return ret; - /* Setup VLAN ID 0 for VLAN-unaware bridges */ - ret = mt7530_setup_vlan0(priv); - if (ret) - return ret; - ds->assisted_learning_on_cpu_port = true; ds->mtu_enforcement_ingress = true; diff --git a/drivers/net/dsa/realtek/Kconfig b/drivers/net/dsa/realtek/Kconfig index 6989972eebc30..d6eb6713e5f6b 100644 --- a/drivers/net/dsa/realtek/Kconfig +++ b/drivers/net/dsa/realtek/Kconfig @@ -43,4 +43,10 @@ config NET_DSA_REALTEK_RTL8366RB help Select to enable support for Realtek RTL8366RB. +config NET_DSA_REALTEK_RTL8366RB_LEDS + bool + depends on (LEDS_CLASS=y || LEDS_CLASS=NET_DSA_REALTEK_RTL8366RB) + depends on NET_DSA_REALTEK_RTL8366RB + default NET_DSA_REALTEK_RTL8366RB + endif diff --git a/drivers/net/dsa/realtek/Makefile b/drivers/net/dsa/realtek/Makefile index 35491dc20d6d6..17367bcba496c 100644 --- a/drivers/net/dsa/realtek/Makefile +++ b/drivers/net/dsa/realtek/Makefile @@ -12,4 +12,7 @@ endif obj-$(CONFIG_NET_DSA_REALTEK_RTL8366RB) += rtl8366.o rtl8366-objs := rtl8366-core.o rtl8366rb.o +ifdef CONFIG_NET_DSA_REALTEK_RTL8366RB_LEDS +rtl8366-objs += rtl8366rb-leds.o +endif obj-$(CONFIG_NET_DSA_REALTEK_RTL8365MB) += rtl8365mb.o diff --git a/drivers/net/dsa/realtek/rtl8366rb-leds.c b/drivers/net/dsa/realtek/rtl8366rb-leds.c new file mode 100644 index 0000000000000..99c890681ae60 --- /dev/null +++ b/drivers/net/dsa/realtek/rtl8366rb-leds.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "rtl83xx.h" +#include "rtl8366rb.h" + +static inline u32 rtl8366rb_led_group_port_mask(u8 led_group, u8 port) +{ + switch (led_group) { + case 0: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + case 1: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + case 2: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + case 3: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + default: + return 0; + } +} + +static int rb8366rb_get_port_led(struct rtl8366rb_led *led) +{ + struct realtek_priv *priv = led->priv; + u8 led_group = led->led_group; + u8 port_num = led->port_num; + int ret; + u32 val; + + ret = regmap_read(priv->map, RTL8366RB_LED_X_X_CTRL_REG(led_group), + &val); + if (ret) { + dev_err(priv->dev, "error reading LED on port %d group %d\n", + led_group, port_num); + return ret; + } + + return !!(val & rtl8366rb_led_group_port_mask(led_group, port_num)); +} + +static int rb8366rb_set_port_led(struct rtl8366rb_led *led, bool enable) +{ + struct realtek_priv *priv = led->priv; + u8 led_group = led->led_group; + u8 port_num = led->port_num; + int ret; + + ret = regmap_update_bits(priv->map, + RTL8366RB_LED_X_X_CTRL_REG(led_group), + rtl8366rb_led_group_port_mask(led_group, + port_num), + enable ? 0xffff : 0); + if (ret) { + dev_err(priv->dev, "error updating LED on port %d group %d\n", + led_group, port_num); + return ret; + } + + /* Change the LED group to manual controlled LEDs if required */ + ret = rb8366rb_set_ledgroup_mode(priv, led_group, + RTL8366RB_LEDGROUP_FORCE); + + if (ret) { + dev_err(priv->dev, "error updating LED GROUP group %d\n", + led_group); + return ret; + } + + return 0; +} + +static int +rtl8366rb_cled_brightness_set_blocking(struct led_classdev *ldev, + enum led_brightness brightness) +{ + struct rtl8366rb_led *led = container_of(ldev, struct rtl8366rb_led, + cdev); + + return rb8366rb_set_port_led(led, brightness == LED_ON); +} + +static int rtl8366rb_setup_led(struct realtek_priv *priv, struct dsa_port *dp, + struct fwnode_handle *led_fwnode) +{ + struct rtl8366rb *rb = priv->chip_data; + struct led_init_data init_data = { }; + enum led_default_state state; + struct rtl8366rb_led *led; + u32 led_group; + int ret; + + ret = fwnode_property_read_u32(led_fwnode, "reg", &led_group); + if (ret) + return ret; + + if (led_group >= RTL8366RB_NUM_LEDGROUPS) { + dev_warn(priv->dev, "Invalid LED reg %d defined for port %d", + led_group, dp->index); + return -EINVAL; + } + + led = &rb->leds[dp->index][led_group]; + led->port_num = dp->index; + led->led_group = led_group; + led->priv = priv; + + state = led_init_default_state_get(led_fwnode); + switch (state) { + case LEDS_DEFSTATE_ON: + led->cdev.brightness = 1; + rb8366rb_set_port_led(led, 1); + break; + case LEDS_DEFSTATE_KEEP: + led->cdev.brightness = + rb8366rb_get_port_led(led); + break; + case LEDS_DEFSTATE_OFF: + default: + led->cdev.brightness = 0; + rb8366rb_set_port_led(led, 0); + } + + led->cdev.max_brightness = 1; + led->cdev.brightness_set_blocking = + rtl8366rb_cled_brightness_set_blocking; + init_data.fwnode = led_fwnode; + init_data.devname_mandatory = true; + + init_data.devicename = kasprintf(GFP_KERNEL, "Realtek-%d:0%d:%d", + dp->ds->index, dp->index, led_group); + if (!init_data.devicename) + return -ENOMEM; + + ret = devm_led_classdev_register_ext(priv->dev, &led->cdev, &init_data); + if (ret) { + dev_warn(priv->dev, "Failed to init LED %d for port %d", + led_group, dp->index); + return ret; + } + + return 0; +} + +int rtl8366rb_setup_leds(struct realtek_priv *priv) +{ + struct dsa_switch *ds = &priv->ds; + struct device_node *leds_np; + struct dsa_port *dp; + int ret = 0; + + dsa_switch_for_each_port(dp, ds) { + if (!dp->dn) + continue; + + leds_np = of_get_child_by_name(dp->dn, "leds"); + if (!leds_np) { + dev_dbg(priv->dev, "No leds defined for port %d", + dp->index); + continue; + } + + for_each_child_of_node_scoped(leds_np, led_np) { + ret = rtl8366rb_setup_led(priv, dp, + of_fwnode_handle(led_np)); + if (ret) + break; + } + + of_node_put(leds_np); + if (ret) + return ret; + } + return 0; +} diff --git a/drivers/net/dsa/realtek/rtl8366rb.c b/drivers/net/dsa/realtek/rtl8366rb.c index 4c4a95d4380ce..f54771cab56d4 100644 --- a/drivers/net/dsa/realtek/rtl8366rb.c +++ b/drivers/net/dsa/realtek/rtl8366rb.c @@ -27,11 +27,7 @@ #include "realtek-smi.h" #include "realtek-mdio.h" #include "rtl83xx.h" - -#define RTL8366RB_PORT_NUM_CPU 5 -#define RTL8366RB_NUM_PORTS 6 -#define RTL8366RB_PHY_NO_MAX 4 -#define RTL8366RB_PHY_ADDR_MAX 31 +#include "rtl8366rb.h" /* Switch Global Configuration register */ #define RTL8366RB_SGCR 0x0000 @@ -176,39 +172,6 @@ */ #define RTL8366RB_VLAN_INGRESS_CTRL2_REG 0x037f -/* LED control registers */ -/* The LED blink rate is global; it is used by all triggers in all groups. */ -#define RTL8366RB_LED_BLINKRATE_REG 0x0430 -#define RTL8366RB_LED_BLINKRATE_MASK 0x0007 -#define RTL8366RB_LED_BLINKRATE_28MS 0x0000 -#define RTL8366RB_LED_BLINKRATE_56MS 0x0001 -#define RTL8366RB_LED_BLINKRATE_84MS 0x0002 -#define RTL8366RB_LED_BLINKRATE_111MS 0x0003 -#define RTL8366RB_LED_BLINKRATE_222MS 0x0004 -#define RTL8366RB_LED_BLINKRATE_446MS 0x0005 - -/* LED trigger event for each group */ -#define RTL8366RB_LED_CTRL_REG 0x0431 -#define RTL8366RB_LED_CTRL_OFFSET(led_group) \ - (4 * (led_group)) -#define RTL8366RB_LED_CTRL_MASK(led_group) \ - (0xf << RTL8366RB_LED_CTRL_OFFSET(led_group)) - -/* The RTL8366RB_LED_X_X registers are used to manually set the LED state only - * when the corresponding LED group in RTL8366RB_LED_CTRL_REG is - * RTL8366RB_LEDGROUP_FORCE. Otherwise, it is ignored. - */ -#define RTL8366RB_LED_0_1_CTRL_REG 0x0432 -#define RTL8366RB_LED_2_3_CTRL_REG 0x0433 -#define RTL8366RB_LED_X_X_CTRL_REG(led_group) \ - ((led_group) <= 1 ? \ - RTL8366RB_LED_0_1_CTRL_REG : \ - RTL8366RB_LED_2_3_CTRL_REG) -#define RTL8366RB_LED_0_X_CTRL_MASK GENMASK(5, 0) -#define RTL8366RB_LED_X_1_CTRL_MASK GENMASK(11, 6) -#define RTL8366RB_LED_2_X_CTRL_MASK GENMASK(5, 0) -#define RTL8366RB_LED_X_3_CTRL_MASK GENMASK(11, 6) - #define RTL8366RB_MIB_COUNT 33 #define RTL8366RB_GLOBAL_MIB_COUNT 1 #define RTL8366RB_MIB_COUNTER_PORT_OFFSET 0x0050 @@ -244,7 +207,6 @@ #define RTL8366RB_PORT_STATUS_AN_MASK 0x0080 #define RTL8366RB_NUM_VLANS 16 -#define RTL8366RB_NUM_LEDGROUPS 4 #define RTL8366RB_NUM_VIDS 4096 #define RTL8366RB_PRIORITYMAX 7 #define RTL8366RB_NUM_FIDS 8 @@ -351,46 +313,6 @@ #define RTL8366RB_GREEN_FEATURE_TX BIT(0) #define RTL8366RB_GREEN_FEATURE_RX BIT(2) -enum rtl8366_ledgroup_mode { - RTL8366RB_LEDGROUP_OFF = 0x0, - RTL8366RB_LEDGROUP_DUP_COL = 0x1, - RTL8366RB_LEDGROUP_LINK_ACT = 0x2, - RTL8366RB_LEDGROUP_SPD1000 = 0x3, - RTL8366RB_LEDGROUP_SPD100 = 0x4, - RTL8366RB_LEDGROUP_SPD10 = 0x5, - RTL8366RB_LEDGROUP_SPD1000_ACT = 0x6, - RTL8366RB_LEDGROUP_SPD100_ACT = 0x7, - RTL8366RB_LEDGROUP_SPD10_ACT = 0x8, - RTL8366RB_LEDGROUP_SPD100_10_ACT = 0x9, - RTL8366RB_LEDGROUP_FIBER = 0xa, - RTL8366RB_LEDGROUP_AN_FAULT = 0xb, - RTL8366RB_LEDGROUP_LINK_RX = 0xc, - RTL8366RB_LEDGROUP_LINK_TX = 0xd, - RTL8366RB_LEDGROUP_MASTER = 0xe, - RTL8366RB_LEDGROUP_FORCE = 0xf, - - __RTL8366RB_LEDGROUP_MODE_MAX -}; - -struct rtl8366rb_led { - u8 port_num; - u8 led_group; - struct realtek_priv *priv; - struct led_classdev cdev; -}; - -/** - * struct rtl8366rb - RTL8366RB-specific data - * @max_mtu: per-port max MTU setting - * @pvid_enabled: if PVID is set for respective port - * @leds: per-port and per-ledgroup led info - */ -struct rtl8366rb { - unsigned int max_mtu[RTL8366RB_NUM_PORTS]; - bool pvid_enabled[RTL8366RB_NUM_PORTS]; - struct rtl8366rb_led leds[RTL8366RB_NUM_PORTS][RTL8366RB_NUM_LEDGROUPS]; -}; - static struct rtl8366_mib_counter rtl8366rb_mib_counters[] = { { 0, 0, 4, "IfInOctets" }, { 0, 4, 4, "EtherStatsOctets" }, @@ -831,9 +753,10 @@ static int rtl8366rb_jam_table(const struct rtl8366rb_jam_tbl_entry *jam_table, return 0; } -static int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, - u8 led_group, - enum rtl8366_ledgroup_mode mode) +/* This code is used also with LEDs disabled */ +int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, + u8 led_group, + enum rtl8366_ledgroup_mode mode) { int ret; u32 val; @@ -850,144 +773,7 @@ static int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, return 0; } -static inline u32 rtl8366rb_led_group_port_mask(u8 led_group, u8 port) -{ - switch (led_group) { - case 0: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - case 1: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - case 2: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - case 3: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - default: - return 0; - } -} - -static int rb8366rb_get_port_led(struct rtl8366rb_led *led) -{ - struct realtek_priv *priv = led->priv; - u8 led_group = led->led_group; - u8 port_num = led->port_num; - int ret; - u32 val; - - ret = regmap_read(priv->map, RTL8366RB_LED_X_X_CTRL_REG(led_group), - &val); - if (ret) { - dev_err(priv->dev, "error reading LED on port %d group %d\n", - led_group, port_num); - return ret; - } - - return !!(val & rtl8366rb_led_group_port_mask(led_group, port_num)); -} - -static int rb8366rb_set_port_led(struct rtl8366rb_led *led, bool enable) -{ - struct realtek_priv *priv = led->priv; - u8 led_group = led->led_group; - u8 port_num = led->port_num; - int ret; - - ret = regmap_update_bits(priv->map, - RTL8366RB_LED_X_X_CTRL_REG(led_group), - rtl8366rb_led_group_port_mask(led_group, - port_num), - enable ? 0xffff : 0); - if (ret) { - dev_err(priv->dev, "error updating LED on port %d group %d\n", - led_group, port_num); - return ret; - } - - /* Change the LED group to manual controlled LEDs if required */ - ret = rb8366rb_set_ledgroup_mode(priv, led_group, - RTL8366RB_LEDGROUP_FORCE); - - if (ret) { - dev_err(priv->dev, "error updating LED GROUP group %d\n", - led_group); - return ret; - } - - return 0; -} - -static int -rtl8366rb_cled_brightness_set_blocking(struct led_classdev *ldev, - enum led_brightness brightness) -{ - struct rtl8366rb_led *led = container_of(ldev, struct rtl8366rb_led, - cdev); - - return rb8366rb_set_port_led(led, brightness == LED_ON); -} - -static int rtl8366rb_setup_led(struct realtek_priv *priv, struct dsa_port *dp, - struct fwnode_handle *led_fwnode) -{ - struct rtl8366rb *rb = priv->chip_data; - struct led_init_data init_data = { }; - enum led_default_state state; - struct rtl8366rb_led *led; - u32 led_group; - int ret; - - ret = fwnode_property_read_u32(led_fwnode, "reg", &led_group); - if (ret) - return ret; - - if (led_group >= RTL8366RB_NUM_LEDGROUPS) { - dev_warn(priv->dev, "Invalid LED reg %d defined for port %d", - led_group, dp->index); - return -EINVAL; - } - - led = &rb->leds[dp->index][led_group]; - led->port_num = dp->index; - led->led_group = led_group; - led->priv = priv; - - state = led_init_default_state_get(led_fwnode); - switch (state) { - case LEDS_DEFSTATE_ON: - led->cdev.brightness = 1; - rb8366rb_set_port_led(led, 1); - break; - case LEDS_DEFSTATE_KEEP: - led->cdev.brightness = - rb8366rb_get_port_led(led); - break; - case LEDS_DEFSTATE_OFF: - default: - led->cdev.brightness = 0; - rb8366rb_set_port_led(led, 0); - } - - led->cdev.max_brightness = 1; - led->cdev.brightness_set_blocking = - rtl8366rb_cled_brightness_set_blocking; - init_data.fwnode = led_fwnode; - init_data.devname_mandatory = true; - - init_data.devicename = kasprintf(GFP_KERNEL, "Realtek-%d:0%d:%d", - dp->ds->index, dp->index, led_group); - if (!init_data.devicename) - return -ENOMEM; - - ret = devm_led_classdev_register_ext(priv->dev, &led->cdev, &init_data); - if (ret) { - dev_warn(priv->dev, "Failed to init LED %d for port %d", - led_group, dp->index); - return ret; - } - - return 0; -} - +/* This code is used also with LEDs disabled */ static int rtl8366rb_setup_all_leds_off(struct realtek_priv *priv) { int ret = 0; @@ -1008,38 +794,6 @@ static int rtl8366rb_setup_all_leds_off(struct realtek_priv *priv) return ret; } -static int rtl8366rb_setup_leds(struct realtek_priv *priv) -{ - struct dsa_switch *ds = &priv->ds; - struct device_node *leds_np; - struct dsa_port *dp; - int ret = 0; - - dsa_switch_for_each_port(dp, ds) { - if (!dp->dn) - continue; - - leds_np = of_get_child_by_name(dp->dn, "leds"); - if (!leds_np) { - dev_dbg(priv->dev, "No leds defined for port %d", - dp->index); - continue; - } - - for_each_child_of_node_scoped(leds_np, led_np) { - ret = rtl8366rb_setup_led(priv, dp, - of_fwnode_handle(led_np)); - if (ret) - break; - } - - of_node_put(leds_np); - if (ret) - return ret; - } - return 0; -} - static int rtl8366rb_setup(struct dsa_switch *ds) { struct realtek_priv *priv = ds->priv; diff --git a/drivers/net/dsa/realtek/rtl8366rb.h b/drivers/net/dsa/realtek/rtl8366rb.h new file mode 100644 index 0000000000000..685ff3275faa1 --- /dev/null +++ b/drivers/net/dsa/realtek/rtl8366rb.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _RTL8366RB_H +#define _RTL8366RB_H + +#include "realtek.h" + +#define RTL8366RB_PORT_NUM_CPU 5 +#define RTL8366RB_NUM_PORTS 6 +#define RTL8366RB_PHY_NO_MAX 4 +#define RTL8366RB_NUM_LEDGROUPS 4 +#define RTL8366RB_PHY_ADDR_MAX 31 + +/* LED control registers */ +/* The LED blink rate is global; it is used by all triggers in all groups. */ +#define RTL8366RB_LED_BLINKRATE_REG 0x0430 +#define RTL8366RB_LED_BLINKRATE_MASK 0x0007 +#define RTL8366RB_LED_BLINKRATE_28MS 0x0000 +#define RTL8366RB_LED_BLINKRATE_56MS 0x0001 +#define RTL8366RB_LED_BLINKRATE_84MS 0x0002 +#define RTL8366RB_LED_BLINKRATE_111MS 0x0003 +#define RTL8366RB_LED_BLINKRATE_222MS 0x0004 +#define RTL8366RB_LED_BLINKRATE_446MS 0x0005 + +/* LED trigger event for each group */ +#define RTL8366RB_LED_CTRL_REG 0x0431 +#define RTL8366RB_LED_CTRL_OFFSET(led_group) \ + (4 * (led_group)) +#define RTL8366RB_LED_CTRL_MASK(led_group) \ + (0xf << RTL8366RB_LED_CTRL_OFFSET(led_group)) + +/* The RTL8366RB_LED_X_X registers are used to manually set the LED state only + * when the corresponding LED group in RTL8366RB_LED_CTRL_REG is + * RTL8366RB_LEDGROUP_FORCE. Otherwise, it is ignored. + */ +#define RTL8366RB_LED_0_1_CTRL_REG 0x0432 +#define RTL8366RB_LED_2_3_CTRL_REG 0x0433 +#define RTL8366RB_LED_X_X_CTRL_REG(led_group) \ + ((led_group) <= 1 ? \ + RTL8366RB_LED_0_1_CTRL_REG : \ + RTL8366RB_LED_2_3_CTRL_REG) +#define RTL8366RB_LED_0_X_CTRL_MASK GENMASK(5, 0) +#define RTL8366RB_LED_X_1_CTRL_MASK GENMASK(11, 6) +#define RTL8366RB_LED_2_X_CTRL_MASK GENMASK(5, 0) +#define RTL8366RB_LED_X_3_CTRL_MASK GENMASK(11, 6) + +enum rtl8366_ledgroup_mode { + RTL8366RB_LEDGROUP_OFF = 0x0, + RTL8366RB_LEDGROUP_DUP_COL = 0x1, + RTL8366RB_LEDGROUP_LINK_ACT = 0x2, + RTL8366RB_LEDGROUP_SPD1000 = 0x3, + RTL8366RB_LEDGROUP_SPD100 = 0x4, + RTL8366RB_LEDGROUP_SPD10 = 0x5, + RTL8366RB_LEDGROUP_SPD1000_ACT = 0x6, + RTL8366RB_LEDGROUP_SPD100_ACT = 0x7, + RTL8366RB_LEDGROUP_SPD10_ACT = 0x8, + RTL8366RB_LEDGROUP_SPD100_10_ACT = 0x9, + RTL8366RB_LEDGROUP_FIBER = 0xa, + RTL8366RB_LEDGROUP_AN_FAULT = 0xb, + RTL8366RB_LEDGROUP_LINK_RX = 0xc, + RTL8366RB_LEDGROUP_LINK_TX = 0xd, + RTL8366RB_LEDGROUP_MASTER = 0xe, + RTL8366RB_LEDGROUP_FORCE = 0xf, + + __RTL8366RB_LEDGROUP_MODE_MAX +}; + +#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8366RB_LEDS) + +struct rtl8366rb_led { + u8 port_num; + u8 led_group; + struct realtek_priv *priv; + struct led_classdev cdev; +}; + +int rtl8366rb_setup_leds(struct realtek_priv *priv); + +#else + +static inline int rtl8366rb_setup_leds(struct realtek_priv *priv) +{ + return 0; +} + +#endif /* IS_ENABLED(CONFIG_LEDS_CLASS) */ + +/** + * struct rtl8366rb - RTL8366RB-specific data + * @max_mtu: per-port max MTU setting + * @pvid_enabled: if PVID is set for respective port + * @leds: per-port and per-ledgroup led info + */ +struct rtl8366rb { + unsigned int max_mtu[RTL8366RB_NUM_PORTS]; + bool pvid_enabled[RTL8366RB_NUM_PORTS]; +#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8366RB_LEDS) + struct rtl8366rb_led leds[RTL8366RB_NUM_PORTS][RTL8366RB_NUM_LEDGROUPS]; +#endif +}; + +/* This code is used also with LEDs disabled */ +int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, + u8 led_group, + enum rtl8366_ledgroup_mode mode); + +#endif /* _RTL8366RB_H */ diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 5740c98d8c9f0..2847278d9cd48 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -1279,6 +1279,8 @@ struct macb { struct clk *rx_clk; struct clk *tsu_clk; struct net_device *dev; + /* Protects hw_stats and ethtool_stats */ + spinlock_t stats_lock; union { struct macb_stats macb; struct gem_stats gem; diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 48496209fb164..c1f57d96e63fc 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1978,10 +1978,12 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id) if (status & MACB_BIT(ISR_ROVR)) { /* We missed at least one packet */ + spin_lock(&bp->stats_lock); if (macb_is_gem(bp)) bp->hw_stats.gem.rx_overruns++; else bp->hw_stats.macb.rx_overruns++; + spin_unlock(&bp->stats_lock); if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(ISR_ROVR)); @@ -3102,6 +3104,7 @@ static struct net_device_stats *gem_get_stats(struct macb *bp) if (!netif_running(bp->dev)) return nstat; + spin_lock_irq(&bp->stats_lock); gem_update_stats(bp); nstat->rx_errors = (hwstat->rx_frame_check_sequence_errors + @@ -3131,6 +3134,7 @@ static struct net_device_stats *gem_get_stats(struct macb *bp) nstat->tx_aborted_errors = hwstat->tx_excessive_collisions; nstat->tx_carrier_errors = hwstat->tx_carrier_sense_errors; nstat->tx_fifo_errors = hwstat->tx_underrun; + spin_unlock_irq(&bp->stats_lock); return nstat; } @@ -3138,12 +3142,13 @@ static struct net_device_stats *gem_get_stats(struct macb *bp) static void gem_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { - struct macb *bp; + struct macb *bp = netdev_priv(dev); - bp = netdev_priv(dev); + spin_lock_irq(&bp->stats_lock); gem_update_stats(bp); memcpy(data, &bp->ethtool_stats, sizeof(u64) * (GEM_STATS_LEN + QUEUE_STATS_LEN * MACB_MAX_QUEUES)); + spin_unlock_irq(&bp->stats_lock); } static int gem_get_sset_count(struct net_device *dev, int sset) @@ -3193,6 +3198,7 @@ static struct net_device_stats *macb_get_stats(struct net_device *dev) return gem_get_stats(bp); /* read stats from hardware */ + spin_lock_irq(&bp->stats_lock); macb_update_stats(bp); /* Convert HW stats into netdevice stats */ @@ -3226,6 +3232,7 @@ static struct net_device_stats *macb_get_stats(struct net_device *dev) nstat->tx_carrier_errors = hwstat->tx_carrier_errors; nstat->tx_fifo_errors = hwstat->tx_underruns; /* Don't know about heartbeat or window errors... */ + spin_unlock_irq(&bp->stats_lock); return nstat; } @@ -5097,6 +5104,7 @@ static int macb_probe(struct platform_device *pdev) } } spin_lock_init(&bp->lock); + spin_lock_init(&bp->stats_lock); /* setup capabilities */ macb_configure_caps(bp, macb_config); diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 991e3839858b5..2b4bb74f21bf2 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1833,9 +1833,8 @@ static int gmac_open(struct net_device *netdev) gmac_enable_tx_rx(netdev); netif_tx_start_all_queues(netdev); - hrtimer_init(&port->rx_coalesce_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - port->rx_coalesce_timer.function = &gmac_coalesce_delay_expired; + hrtimer_setup(&port->rx_coalesce_timer, &gmac_coalesce_delay_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); netdev_dbg(netdev, "opened\n"); diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c index 44af1d13d9317..67275aa4f65b2 100644 --- a/drivers/net/ethernet/ec_bhf.c +++ b/drivers/net/ethernet/ec_bhf.c @@ -416,8 +416,7 @@ static int ec_bhf_open(struct net_device *net_dev) netif_start_queue(net_dev); - hrtimer_init(&priv->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - priv->hrtimer.function = ec_bhf_timer_fun; + hrtimer_setup(&priv->hrtimer, ec_bhf_timer_fun, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_start(&priv->hrtimer, polling_frequency, HRTIMER_MODE_REL); return 0; diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index e48b861e4ce15..270ff9aab3352 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -562,7 +562,7 @@ struct be_adapter { struct be_dma_mem mbox_mem_alloced; struct be_mcc_obj mcc_obj; - struct mutex mcc_lock; /* For serializing mcc cmds to BE card */ + spinlock_t mcc_lock; /* For serializing mcc cmds to BE card */ spinlock_t mcc_cq_lock; u16 cfg_num_rx_irqs; /* configured via set-channels */ diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 61adcebeef010..51b8377edd1d0 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -575,7 +575,7 @@ int be_process_mcc(struct be_adapter *adapter) /* Wait till no more pending mcc requests are present */ static int be_mcc_wait_compl(struct be_adapter *adapter) { -#define mcc_timeout 12000 /* 12s timeout */ +#define mcc_timeout 120000 /* 12s timeout */ int i, status = 0; struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; @@ -589,7 +589,7 @@ static int be_mcc_wait_compl(struct be_adapter *adapter) if (atomic_read(&mcc_obj->q.used) == 0) break; - usleep_range(500, 1000); + udelay(100); } if (i == mcc_timeout) { dev_err(&adapter->pdev->dev, "FW not responding\n"); @@ -866,7 +866,7 @@ static bool use_mcc(struct be_adapter *adapter) static int be_cmd_lock(struct be_adapter *adapter) { if (use_mcc(adapter)) { - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); return 0; } else { return mutex_lock_interruptible(&adapter->mbox_lock); @@ -877,7 +877,7 @@ static int be_cmd_lock(struct be_adapter *adapter) static void be_cmd_unlock(struct be_adapter *adapter) { if (use_mcc(adapter)) - return mutex_unlock(&adapter->mcc_lock); + return spin_unlock_bh(&adapter->mcc_lock); else return mutex_unlock(&adapter->mbox_lock); } @@ -1047,7 +1047,7 @@ int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr, struct be_cmd_req_mac_query *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1076,7 +1076,7 @@ int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1088,7 +1088,7 @@ int be_cmd_pmac_add(struct be_adapter *adapter, const u8 *mac_addr, struct be_cmd_req_pmac_add *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1113,7 +1113,7 @@ int be_cmd_pmac_add(struct be_adapter *adapter, const u8 *mac_addr, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); if (base_status(status) == MCC_STATUS_UNAUTHORIZED_REQUEST) status = -EPERM; @@ -1131,7 +1131,7 @@ int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, u32 dom) if (pmac_id == -1) return 0; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1151,7 +1151,7 @@ int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, u32 dom) status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1414,7 +1414,7 @@ int be_cmd_rxq_create(struct be_adapter *adapter, struct be_dma_mem *q_mem = &rxq->dma_mem; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1444,7 +1444,7 @@ int be_cmd_rxq_create(struct be_adapter *adapter, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1508,7 +1508,7 @@ int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q) struct be_cmd_req_q_destroy *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1525,7 +1525,7 @@ int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q) q->created = false; err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1593,7 +1593,7 @@ int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd) struct be_cmd_req_hdr *hdr; int status = 0; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1621,7 +1621,7 @@ int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd) adapter->stats_cmd_sent = true; err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1637,7 +1637,7 @@ int lancer_cmd_get_pport_stats(struct be_adapter *adapter, CMD_SUBSYSTEM_ETH)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1660,7 +1660,7 @@ int lancer_cmd_get_pport_stats(struct be_adapter *adapter, adapter->stats_cmd_sent = true; err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1697,7 +1697,7 @@ int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed, struct be_cmd_req_link_status *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); if (link_status) *link_status = LINK_DOWN; @@ -1736,7 +1736,7 @@ int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1747,7 +1747,7 @@ int be_cmd_get_die_temperature(struct be_adapter *adapter) struct be_cmd_req_get_cntl_addnl_attribs *req; int status = 0; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1762,7 +1762,7 @@ int be_cmd_get_die_temperature(struct be_adapter *adapter) status = be_mcc_notify(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1811,7 +1811,7 @@ int be_cmd_get_fat_dump(struct be_adapter *adapter, u32 buf_len, void *buf) if (!get_fat_cmd.va) return -ENOMEM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); while (total_size) { buf_size = min(total_size, (u32)60 * 1024); @@ -1849,9 +1849,9 @@ int be_cmd_get_fat_dump(struct be_adapter *adapter, u32 buf_len, void *buf) log_offset += buf_size; } err: + spin_unlock_bh(&adapter->mcc_lock); dma_free_coherent(&adapter->pdev->dev, get_fat_cmd.size, get_fat_cmd.va, get_fat_cmd.dma); - mutex_unlock(&adapter->mcc_lock); return status; } @@ -1862,7 +1862,7 @@ int be_cmd_get_fw_ver(struct be_adapter *adapter) struct be_cmd_req_get_fw_version *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1885,7 +1885,7 @@ int be_cmd_get_fw_ver(struct be_adapter *adapter) sizeof(adapter->fw_on_flash)); } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1899,7 +1899,7 @@ static int __be_cmd_modify_eqd(struct be_adapter *adapter, struct be_cmd_req_modify_eq_delay *req; int status = 0, i; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1922,7 +1922,7 @@ static int __be_cmd_modify_eqd(struct be_adapter *adapter, status = be_mcc_notify(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1949,7 +1949,7 @@ int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, struct be_cmd_req_vlan_config *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -1971,7 +1971,7 @@ int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -1982,7 +1982,7 @@ static int __be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value) struct be_cmd_req_rx_filter *req = mem->va; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2015,7 +2015,7 @@ static int __be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value) status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2046,7 +2046,7 @@ int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc) CMD_SUBSYSTEM_COMMON)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2066,7 +2066,7 @@ int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc) status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); if (base_status(status) == MCC_STATUS_FEATURE_NOT_SUPPORTED) return -EOPNOTSUPP; @@ -2085,7 +2085,7 @@ int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc) CMD_SUBSYSTEM_COMMON)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2108,7 +2108,7 @@ int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc) } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2189,7 +2189,7 @@ int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, if (!(be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS)) return 0; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2214,7 +2214,7 @@ int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2226,7 +2226,7 @@ int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, struct be_cmd_req_enable_disable_beacon *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2247,7 +2247,7 @@ int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2258,7 +2258,7 @@ int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state) struct be_cmd_req_get_beacon_state *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2282,7 +2282,7 @@ int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state) } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2306,7 +2306,7 @@ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, return -ENOMEM; } - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2328,7 +2328,7 @@ int be_cmd_read_port_transceiver_data(struct be_adapter *adapter, memcpy(data, resp->page_data + off, len); } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); return status; } @@ -2345,7 +2345,7 @@ static int lancer_cmd_write_object(struct be_adapter *adapter, void *ctxt = NULL; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); adapter->flash_status = 0; wrb = wrb_from_mccq(adapter); @@ -2387,7 +2387,7 @@ static int lancer_cmd_write_object(struct be_adapter *adapter, if (status) goto err_unlock; - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); if (!wait_for_completion_timeout(&adapter->et_cmd_compl, msecs_to_jiffies(60000))) @@ -2406,7 +2406,7 @@ static int lancer_cmd_write_object(struct be_adapter *adapter, return status; err_unlock: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2460,7 +2460,7 @@ static int lancer_cmd_delete_object(struct be_adapter *adapter, struct be_mcc_wrb *wrb; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2478,7 +2478,7 @@ static int lancer_cmd_delete_object(struct be_adapter *adapter, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2491,7 +2491,7 @@ int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, struct lancer_cmd_resp_read_object *resp; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2525,7 +2525,7 @@ int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd, } err_unlock: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2537,7 +2537,7 @@ static int be_cmd_write_flashrom(struct be_adapter *adapter, struct be_cmd_write_flashrom *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); adapter->flash_status = 0; wrb = wrb_from_mccq(adapter); @@ -2562,7 +2562,7 @@ static int be_cmd_write_flashrom(struct be_adapter *adapter, if (status) goto err_unlock; - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); if (!wait_for_completion_timeout(&adapter->et_cmd_compl, msecs_to_jiffies(40000))) @@ -2573,7 +2573,7 @@ static int be_cmd_write_flashrom(struct be_adapter *adapter, return status; err_unlock: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -2584,7 +2584,7 @@ static int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, struct be_mcc_wrb *wrb; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -2611,7 +2611,7 @@ static int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, memcpy(flashed_crc, req->crc, 4); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3217,7 +3217,7 @@ int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac, struct be_cmd_req_acpi_wol_magic_config *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3234,7 +3234,7 @@ int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3249,7 +3249,7 @@ int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, CMD_SUBSYSTEM_LOWLEVEL)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3272,7 +3272,7 @@ int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, if (status) goto err_unlock; - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); if (!wait_for_completion_timeout(&adapter->et_cmd_compl, msecs_to_jiffies(SET_LB_MODE_TIMEOUT))) @@ -3281,7 +3281,7 @@ int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num, return status; err_unlock: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3298,7 +3298,7 @@ int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, CMD_SUBSYSTEM_LOWLEVEL)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3324,7 +3324,7 @@ int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, if (status) goto err; - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); wait_for_completion(&adapter->et_cmd_compl); resp = embedded_payload(wrb); @@ -3332,7 +3332,7 @@ int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num, return status; err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3348,7 +3348,7 @@ int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, CMD_SUBSYSTEM_LOWLEVEL)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3382,7 +3382,7 @@ int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3393,7 +3393,7 @@ int be_cmd_get_seeprom_data(struct be_adapter *adapter, struct be_cmd_req_seeprom_read *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3409,7 +3409,7 @@ int be_cmd_get_seeprom_data(struct be_adapter *adapter, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3424,7 +3424,7 @@ int be_cmd_get_phy_info(struct be_adapter *adapter) CMD_SUBSYSTEM_COMMON)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3469,7 +3469,7 @@ int be_cmd_get_phy_info(struct be_adapter *adapter) } dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3479,7 +3479,7 @@ static int be_cmd_set_qos(struct be_adapter *adapter, u32 bps, u32 domain) struct be_cmd_req_set_qos *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3499,7 +3499,7 @@ static int be_cmd_set_qos(struct be_adapter *adapter, u32 bps, u32 domain) status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3611,7 +3611,7 @@ int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege, struct be_cmd_req_get_fn_privileges *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3643,7 +3643,7 @@ int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3655,7 +3655,7 @@ int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges, struct be_cmd_req_set_fn_privileges *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3675,7 +3675,7 @@ int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3707,7 +3707,7 @@ int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac, return -ENOMEM; } - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3771,7 +3771,7 @@ int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac, } out: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); dma_free_coherent(&adapter->pdev->dev, get_mac_list_cmd.size, get_mac_list_cmd.va, get_mac_list_cmd.dma); return status; @@ -3831,7 +3831,7 @@ int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, if (!cmd.va) return -ENOMEM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3853,7 +3853,7 @@ int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, err: dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma); - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3889,7 +3889,7 @@ int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, CMD_SUBSYSTEM_COMMON)) return -EPERM; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3930,7 +3930,7 @@ int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -3944,7 +3944,7 @@ int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, int status; u16 vid; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -3991,7 +3991,7 @@ int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -4190,7 +4190,7 @@ int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter, struct be_cmd_req_set_ext_fat_caps *req; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -4206,7 +4206,7 @@ int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -4684,7 +4684,7 @@ int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op) if (iface == 0xFFFFFFFF) return -1; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -4701,7 +4701,7 @@ int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op) status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -4735,7 +4735,7 @@ int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, struct be_cmd_resp_get_iface_list *resp; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -4756,7 +4756,7 @@ int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg, } err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -4850,7 +4850,7 @@ int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain) if (BEx_chip(adapter)) return 0; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -4868,7 +4868,7 @@ int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain) req->enable = 1; status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -4941,7 +4941,7 @@ __be_cmd_set_logical_link_config(struct be_adapter *adapter, u32 link_config = 0; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -4969,7 +4969,7 @@ __be_cmd_set_logical_link_config(struct be_adapter *adapter, status = be_mcc_notify_wait(adapter); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -5000,8 +5000,7 @@ int be_cmd_set_features(struct be_adapter *adapter) struct be_mcc_wrb *wrb; int status; - if (mutex_lock_interruptible(&adapter->mcc_lock)) - return -1; + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -5039,7 +5038,7 @@ int be_cmd_set_features(struct be_adapter *adapter) dev_info(&adapter->pdev->dev, "Adapter does not support HW error recovery\n"); - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } @@ -5053,7 +5052,7 @@ int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload, struct be_cmd_resp_hdr *resp; int status; - mutex_lock(&adapter->mcc_lock); + spin_lock_bh(&adapter->mcc_lock); wrb = wrb_from_mccq(adapter); if (!wrb) { @@ -5076,7 +5075,7 @@ int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload, memcpy(wrb_payload, resp, sizeof(*resp) + resp->response_length); be_dws_le_to_cpu(wrb_payload, sizeof(*resp) + resp->response_length); err: - mutex_unlock(&adapter->mcc_lock); + spin_unlock_bh(&adapter->mcc_lock); return status; } EXPORT_SYMBOL(be_roce_mcc_cmd); diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 875fe379eea21..3d2e215921191 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5667,8 +5667,8 @@ static int be_drv_init(struct be_adapter *adapter) } mutex_init(&adapter->mbox_lock); - mutex_init(&adapter->mcc_lock); mutex_init(&adapter->rx_filter_lock); + spin_lock_init(&adapter->mcc_lock); spin_lock_init(&adapter->mcc_cq_lock); init_completion(&adapter->et_cmd_compl); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 6a6fc819dfdee..2106861463e40 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -167,6 +167,24 @@ static bool enetc_skb_is_tcp(struct sk_buff *skb) return skb->csum_offset == offsetof(struct tcphdr, check); } +/** + * enetc_unwind_tx_frame() - Unwind the DMA mappings of a multi-buffer Tx frame + * @tx_ring: Pointer to the Tx ring on which the buffer descriptors are located + * @count: Number of Tx buffer descriptors which need to be unmapped + * @i: Index of the last successfully mapped Tx buffer descriptor + */ +static void enetc_unwind_tx_frame(struct enetc_bdr *tx_ring, int count, int i) +{ + while (count--) { + struct enetc_tx_swbd *tx_swbd = &tx_ring->tx_swbd[i]; + + enetc_free_tx_frame(tx_ring, tx_swbd); + if (i == 0) + i = tx_ring->bd_count; + i--; + } +} + static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) { bool do_vlan, do_onestep_tstamp = false, do_twostep_tstamp = false; @@ -279,9 +297,11 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) } if (do_onestep_tstamp) { - u32 lo, hi, val; - u64 sec, nsec; + __be32 new_sec_l, new_nsec; + u32 lo, hi, nsec, val; + __be16 new_sec_h; u8 *data; + u64 sec; lo = enetc_rd_hot(hw, ENETC_SICTR0); hi = enetc_rd_hot(hw, ENETC_SICTR1); @@ -295,13 +315,38 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) /* Update originTimestamp field of Sync packet * - 48 bits seconds field * - 32 bits nanseconds field + * + * In addition, the UDP checksum needs to be updated + * by software after updating originTimestamp field, + * otherwise the hardware will calculate the wrong + * checksum when updating the correction field and + * update it to the packet. */ data = skb_mac_header(skb); - *(__be16 *)(data + offset2) = - htons((sec >> 32) & 0xffff); - *(__be32 *)(data + offset2 + 2) = - htonl(sec & 0xffffffff); - *(__be32 *)(data + offset2 + 6) = htonl(nsec); + new_sec_h = htons((sec >> 32) & 0xffff); + new_sec_l = htonl(sec & 0xffffffff); + new_nsec = htonl(nsec); + if (udp) { + struct udphdr *uh = udp_hdr(skb); + __be32 old_sec_l, old_nsec; + __be16 old_sec_h; + + old_sec_h = *(__be16 *)(data + offset2); + inet_proto_csum_replace2(&uh->check, skb, old_sec_h, + new_sec_h, false); + + old_sec_l = *(__be32 *)(data + offset2 + 2); + inet_proto_csum_replace4(&uh->check, skb, old_sec_l, + new_sec_l, false); + + old_nsec = *(__be32 *)(data + offset2 + 6); + inet_proto_csum_replace4(&uh->check, skb, old_nsec, + new_nsec, false); + } + + *(__be16 *)(data + offset2) = new_sec_h; + *(__be32 *)(data + offset2 + 2) = new_sec_l; + *(__be32 *)(data + offset2 + 6) = new_nsec; /* Configure single-step register */ val = ENETC_PM0_SINGLE_STEP_EN; @@ -372,25 +417,20 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) dma_err: dev_err(tx_ring->dev, "DMA map error"); - do { - tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_frame(tx_ring, tx_swbd); - if (i == 0) - i = tx_ring->bd_count; - i--; - } while (count--); + enetc_unwind_tx_frame(tx_ring, count, i); return 0; } -static void enetc_map_tx_tso_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, - struct enetc_tx_swbd *tx_swbd, - union enetc_tx_bd *txbd, int *i, int hdr_len, - int data_len) +static int enetc_map_tx_tso_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, + struct enetc_tx_swbd *tx_swbd, + union enetc_tx_bd *txbd, int *i, int hdr_len, + int data_len) { union enetc_tx_bd txbd_tmp; u8 flags = 0, e_flags = 0; dma_addr_t addr; + int count = 1; enetc_clear_tx_bd(&txbd_tmp); addr = tx_ring->tso_headers_dma + *i * TSO_HEADER_SIZE; @@ -433,7 +473,10 @@ static void enetc_map_tx_tso_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, /* Write the BD */ txbd_tmp.ext.e_flags = e_flags; *txbd = txbd_tmp; + count++; } + + return count; } static int enetc_map_tx_tso_data(struct enetc_bdr *tx_ring, struct sk_buff *skb, @@ -790,9 +833,9 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb /* compute the csum over the L4 header */ csum = enetc_tso_hdr_csum(&tso, skb, hdr, hdr_len, &pos); - enetc_map_tx_tso_hdr(tx_ring, skb, tx_swbd, txbd, &i, hdr_len, data_len); + count += enetc_map_tx_tso_hdr(tx_ring, skb, tx_swbd, txbd, + &i, hdr_len, data_len); bd_data_num = 0; - count++; while (data_len > 0) { int size; @@ -816,8 +859,13 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb err = enetc_map_tx_tso_data(tx_ring, skb, tx_swbd, txbd, tso.data, size, size == data_len); - if (err) + if (err) { + if (i == 0) + i = tx_ring->bd_count; + i--; + goto err_map_data; + } data_len -= size; count++; @@ -846,13 +894,7 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb dev_err(tx_ring->dev, "DMA map error"); err_chained_bd: - do { - tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_frame(tx_ring, tx_swbd); - if (i == 0) - i = tx_ring->bd_count; - i--; - } while (count--); + enetc_unwind_tx_frame(tx_ring, count, i); return 0; } @@ -1901,7 +1943,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, enetc_xdp_drop(rx_ring, orig_i, i); tx_ring->stats.xdp_tx_drops++; } else { - tx_ring->stats.xdp_tx += xdp_tx_bd_cnt; + tx_ring->stats.xdp_tx++; rx_ring->xdp.xdp_tx_in_flight += xdp_tx_bd_cnt; xdp_tx_frm_cnt++; /* The XDP_TX enqueue was successful, so we @@ -3228,6 +3270,9 @@ static int enetc_hwtstamp_set(struct net_device *ndev, struct ifreq *ifr) new_offloads |= ENETC_F_TX_TSTAMP; break; case HWTSTAMP_TX_ONESTEP_SYNC: + if (!enetc_si_is_pf(priv->si)) + return -EOPNOTSUPP; + new_offloads &= ~ENETC_F_TX_TSTAMP_MASK; new_offloads |= ENETC_F_TX_ONESTEP_SYNC_TSTAMP; break; diff --git a/drivers/net/ethernet/freescale/enetc/enetc4_pf.c b/drivers/net/ethernet/freescale/enetc/enetc4_pf.c index fc41078c4f5da..73ac8c6afb3ad 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc4_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc4_pf.c @@ -672,7 +672,6 @@ static int enetc4_pf_netdev_create(struct enetc_si *si) err_alloc_msix: err_config_si: err_clk_get: - mutex_destroy(&priv->mm_lock); free_netdev(ndev); return err; @@ -684,6 +683,7 @@ static void enetc4_pf_netdev_destroy(struct enetc_si *si) struct net_device *ndev = si->ndev; unregister_netdev(ndev); + enetc4_link_deinit(priv); enetc_free_msix(priv); free_netdev(ndev); } diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index bf34b5bb1e358..ece3ae28ba827 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -832,6 +832,7 @@ static int enetc_set_coalesce(struct net_device *ndev, static int enetc_get_ts_info(struct net_device *ndev, struct kernel_ethtool_ts_info *info) { + struct enetc_ndev_priv *priv = netdev_priv(ndev); int *phc_idx; phc_idx = symbol_get(enetc_phc_index); @@ -852,8 +853,10 @@ static int enetc_get_ts_info(struct net_device *ndev, SOF_TIMESTAMPING_TX_SOFTWARE; info->tx_types = (1 << HWTSTAMP_TX_OFF) | - (1 << HWTSTAMP_TX_ON) | - (1 << HWTSTAMP_TX_ONESTEP_SYNC); + (1 << HWTSTAMP_TX_ON); + + if (enetc_si_is_pf(priv->si)) + info->tx_types |= (1 << HWTSTAMP_TX_ONESTEP_SYNC); info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) | (1 << HWTSTAMP_FILTER_ALL); diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 7f6b574320716..fe4e7f99b6a3a 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -739,8 +739,8 @@ void fec_ptp_init(struct platform_device *pdev, int irq_idx) INIT_DELAYED_WORK(&fep->time_keep, fec_time_keep); - hrtimer_init(&fep->perout_timer, CLOCK_REALTIME, HRTIMER_MODE_REL); - fep->perout_timer.function = fec_ptp_pps_perout_handler; + hrtimer_setup(&fep->perout_timer, fec_ptp_pps_perout_handler, CLOCK_REALTIME, + HRTIMER_MODE_REL); irq = platform_get_irq_byname_optional(pdev, "pps"); if (irq < 0) diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 8ac0047f1ada1..f0674a4435670 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -109,10 +109,12 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_rx_ring *rx = &priv->rx[idx]; if (!gve_rx_was_added_to_block(priv, idx)) return; + page_pool_disable_direct_recycling(rx->dqo.page_pool); gve_remove_napi(priv, ntfy_idx); gve_rx_remove_from_block(priv, idx); gve_rx_reset_ring_dqo(priv, idx); diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index a376d4bdf2819..18376bcc718a2 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -934,8 +934,6 @@ static int hip04_mac_probe(struct platform_device *pdev) priv->chan = arg.args[1] * RX_DESC_NUM; priv->group = arg.args[2]; - hrtimer_init(&priv->tx_coalesce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - /* BQL will try to keep the TX queue as short as possible, but it can't * be faster than tx_coalesce_usecs, so we need a fast timeout here, * but also long enough to gather up enough frames to ensure we don't @@ -944,7 +942,7 @@ static int hip04_mac_probe(struct platform_device *pdev) */ priv->tx_coalesce_frames = TX_DESC_NUM * 3 / 4; priv->tx_coalesce_usecs = 200; - priv->tx_coalesce_timer.function = tx_done; + hrtimer_setup(&priv->tx_coalesce_timer, tx_done, CLOCK_MONOTONIC, HRTIMER_MODE_REL); priv->map = syscon_node_to_regmap(arg.np); of_node_put(arg.np); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index bab16c2191b2f..181af419b878d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -483,7 +483,7 @@ int hclge_ptp_init(struct hclge_dev *hdev) ret = hclge_ptp_get_cycle(hdev); if (ret) - return ret; + goto out; } ret = hclge_ptp_int_en(hdev, true); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 852e5b62f0a5d..6faa62bced3a2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1983,7 +1983,7 @@ static int iavf_reinit_interrupt_scheme(struct iavf_adapter *adapter, bool runni static void iavf_finish_config(struct work_struct *work) { struct iavf_adapter *adapter; - bool netdev_released = false; + bool locks_released = false; int pairs, err; adapter = container_of(work, struct iavf_adapter, finish_config); @@ -2012,19 +2012,22 @@ static void iavf_finish_config(struct work_struct *work) netif_set_real_num_tx_queues(adapter->netdev, pairs); if (adapter->netdev->reg_state != NETREG_REGISTERED) { + mutex_unlock(&adapter->crit_lock); netdev_unlock(adapter->netdev); - netdev_released = true; + locks_released = true; err = register_netdevice(adapter->netdev); if (err) { dev_err(&adapter->pdev->dev, "Unable to register netdev (%d)\n", err); /* go back and try again.*/ + mutex_lock(&adapter->crit_lock); iavf_free_rss(adapter); iavf_free_misc_irq(adapter); iavf_reset_interrupt_capability(adapter); iavf_change_state(adapter, __IAVF_INIT_CONFIG_ADAPTER); + mutex_unlock(&adapter->crit_lock); goto out; } } @@ -2040,9 +2043,10 @@ static void iavf_finish_config(struct work_struct *work) } out: - mutex_unlock(&adapter->crit_lock); - if (!netdev_released) + if (!locks_released) { + mutex_unlock(&adapter->crit_lock); netdev_unlock(adapter->netdev); + } rtnl_unlock(); } diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index fb527434b58b1..d649c197cf673 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -38,8 +38,7 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) if (ice_vsi_add_vlan_zero(uplink_vsi)) goto err_vlan_zero; - if (ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, true, - ICE_FLTR_RX)) + if (ice_set_dflt_vsi(uplink_vsi)) goto err_def_rx; if (ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, true, diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index b83f99c01d91b..8aabf7749aa5e 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -36,6 +36,7 @@ static void ice_free_vf_entries(struct ice_pf *pf) hash_for_each_safe(vfs->table, bkt, tmp, vf, entry) { hash_del_rcu(&vf->entry); + ice_deinitialize_vf_entry(vf); ice_put_vf(vf); } } @@ -193,10 +194,6 @@ void ice_free_vfs(struct ice_pf *pf) wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); } - /* clear malicious info since the VF is getting released */ - if (!ice_is_feature_supported(pf, ICE_F_MBX_LIMIT)) - list_del(&vf->mbx_info.list_entry); - mutex_unlock(&vf->cfg_lock); } diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c index c7c0c2f50c265..815ad0bfe8326 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c @@ -1036,6 +1036,14 @@ void ice_initialize_vf_entry(struct ice_vf *vf) mutex_init(&vf->cfg_lock); } +void ice_deinitialize_vf_entry(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + + if (!ice_is_feature_supported(pf, ICE_F_MBX_LIMIT)) + list_del(&vf->mbx_info.list_entry); +} + /** * ice_dis_vf_qs - Disable the VF queues * @vf: pointer to the VF structure diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h b/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h index 0c7e77c0a09fa..5392b04049862 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h @@ -24,6 +24,7 @@ #endif void ice_initialize_vf_entry(struct ice_vf *vf); +void ice_deinitialize_vf_entry(struct ice_vf *vf); void ice_dis_vf_qs(struct ice_vf *vf); int ice_check_vf_init(struct ice_vf *vf); enum virtchnl_status_code ice_err_to_virt_err(int err); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 9be6a6b59c4e1..977741c414980 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3013,7 +3013,6 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, skb_shinfo(skb)->gso_size = rsc_seg_len; skb_reset_network_header(skb); - len = skb->len - skb_transport_offset(skb); if (ipv4) { struct iphdr *ipv4h = ip_hdr(skb); @@ -3022,6 +3021,7 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, /* Reset and set transport header offset in skb */ skb_set_transport_header(skb, sizeof(struct iphdr)); + len = skb->len - skb_transport_offset(skb); /* Compute the TCP pseudo header checksum*/ tcp_hdr(skb)->check = @@ -3031,6 +3031,7 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + len = skb->len - skb_transport_offset(skb); tcp_hdr(skb)->check = ~tcp_v6_check(len, &ipv6h->saddr, &ipv6h->daddr, 0); } diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 84307bb7313e0..733820a0c3508 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7090,8 +7090,8 @@ static int igc_probe(struct pci_dev *pdev, INIT_WORK(&adapter->reset_task, igc_reset_task); INIT_WORK(&adapter->watchdog_task, igc_watchdog_task); - hrtimer_init(&adapter->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - adapter->hrtimer.function = &igc_qbv_scheduling_timer; + hrtimer_setup(&adapter->hrtimer, &igc_qbv_scheduling_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* Initialize link properties that are user-changeable */ adapter->fc_autoneg = true; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 683c668672d65..cb07ecd8937d3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1122,7 +1122,7 @@ static bool ixgbe_is_media_cage_present(struct ixgbe_hw *hw) * returns error (ENOENT), then no cage present. If no cage present then * connection type is backplane or BASE-T. */ - return ixgbe_aci_get_netlist_node(hw, cmd, NULL, NULL); + return !ixgbe_aci_get_netlist_node(hw, cmd, NULL, NULL); } /** diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index 1641791a2d5b4..8ed83fb988624 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -324,7 +324,7 @@ static const struct mvpp2_cls_flow cls_flows[MVPP2_N_PRS_FLOWS] = { MVPP2_PRS_RI_VLAN_MASK), /* Non IP flow, with vlan tag */ MVPP2_DEF_FLOW(MVPP22_FLOW_ETHERNET, MVPP2_FL_NON_IP_TAG, - MVPP22_CLS_HEK_OPT_VLAN, + MVPP22_CLS_HEK_TAGGED, 0, 0), }; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index dd76c1b7ed3a1..3c7b43712d25f 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6985,9 +6985,8 @@ static int mvpp2_port_probe(struct platform_device *pdev, for (thread = 0; thread < priv->nthreads; thread++) { port_pcpu = per_cpu_ptr(port->pcpu, thread); - hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED_SOFT); - port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb; + hrtimer_setup(&port_pcpu->tx_done_timer, mvpp2_hr_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); port_pcpu->timer_scheduled = false; port_pcpu->dev = dev; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/ptp.c b/drivers/net/ethernet/marvell/octeontx2/af/ptp.c index bcc96eed24813..66749b3649c1f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/ptp.c @@ -545,8 +545,7 @@ static int ptp_probe(struct pci_dev *pdev, spin_lock_init(&ptp->ptp_lock); if (cn10k_ptp_errata(ptp)) { ptp->read_ptp_tstmp = &read_ptp_tstmp_sec_nsec; - hrtimer_init(&ptp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ptp->hrtimer.function = ptp_reset_thresh; + hrtimer_setup(&ptp->hrtimer, ptp_reset_thresh, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } else { ptp->read_ptp_tstmp = &read_ptp_tstmp_nsec; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b7c843446e11..823c1ba456cd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -564,6 +564,9 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_ return err; esw_qos_normalize_min_rate(parent->esw, parent, extack); + trace_mlx5_esw_vport_qos_create(vport->dev, vport, + vport->qos.sched_node->max_rate, + vport->qos.sched_node->bw_share); return 0; } @@ -591,8 +594,11 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t sched_node->vport = vport; vport->qos.sched_node = sched_node; err = esw_qos_vport_enable(vport, parent, extack); - if (err) + if (err) { + __esw_qos_free_node(sched_node); esw_qos_put(esw); + vport->qos.sched_node = NULL; + } return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 7db9cab9bedf6..d9362eabc6a1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -572,7 +572,7 @@ irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name, pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ; pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ; mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d", - name, size, start); + name ? name : "mlx5_pcif_pool", size, start); return pool; } diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c index 138ac58fae51a..f713656f1fae1 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c @@ -375,6 +375,6 @@ irqreturn_t sparx5_xtr_handler(int irq, void *_sparx5) void sparx5_port_inj_timer_setup(struct sparx5_port *port) { - hrtimer_init(&port->inj_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->inj_timer.function = sparx5_injection_timeout; + hrtimer_setup(&port->inj_timer, sparx5_injection_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index a5e3d1a88305e..8b4640c5d61eb 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -686,8 +686,8 @@ void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u32 size, void rmnet_map_tx_aggregate_init(struct rmnet_port *port) { - hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->hrtimer.function = rmnet_map_flush_tx_packet_queue; + hrtimer_setup(&port->hrtimer, rmnet_map_flush_tx_packet_queue, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); spin_lock_init(&port->agg_lock); rmnet_map_update_ul_agg_config(port, 4096, 1, 800); INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index bfe6e2d631bdf..ab7c2750c1042 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -11,6 +11,8 @@ #include "dwmac_dma.h" #include "dwmac1000.h" +#define DRIVER_NAME "dwmac-loongson-pci" + /* Normal Loongson Tx Summary */ #define DMA_INTR_ENA_NIE_TX_LOONGSON 0x00040000 /* Normal Loongson Rx Summary */ @@ -516,6 +518,19 @@ static int loongson_dwmac_acpi_config(struct pci_dev *pdev, return 0; } +/* Loongson's DWMAC device may take nearly two seconds to complete DMA reset */ +static int loongson_dwmac_fix_reset(void *priv, void __iomem *ioaddr) +{ + u32 value = readl(ioaddr + DMA_BUS_MODE); + + value |= DMA_BUS_MODE_SFT_RESET; + writel(value, ioaddr + DMA_BUS_MODE); + + return readl_poll_timeout(ioaddr + DMA_BUS_MODE, value, + !(value & DMA_BUS_MODE_SFT_RESET), + 10000, 2000000); +} + static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct plat_stmmacenet_data *plat; @@ -555,7 +570,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (pci_resource_len(pdev, i) == 0) continue; - ret = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev)); + ret = pcim_iomap_regions(pdev, BIT(0), DRIVER_NAME); if (ret) goto err_disable_device; break; @@ -566,6 +581,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id plat->bsp_priv = ld; plat->setup = loongson_dwmac_setup; + plat->fix_soc_reset = loongson_dwmac_fix_reset; ld->dev = &pdev->dev; ld->loongson_id = readl(res.addr + GMAC_VERSION) & 0xff; @@ -673,7 +689,7 @@ static const struct pci_device_id loongson_dwmac_id_table[] = { MODULE_DEVICE_TABLE(pci, loongson_dwmac_id_table); static struct pci_driver loongson_dwmac_driver = { - .name = "dwmac-loongson-pci", + .name = DRIVER_NAME, .id_table = loongson_dwmac_id_table, .probe = loongson_dwmac_probe, .remove = loongson_dwmac_remove, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c0ae7db96f46f..554d2c0a8fde0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3199,8 +3199,7 @@ static void stmmac_init_coalesce(struct stmmac_priv *priv) priv->tx_coal_frames[chan] = STMMAC_TX_FRAMES; priv->tx_coal_timer[chan] = STMMAC_COAL_TX_TIMER; - hrtimer_init(&tx_q->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - tx_q->txtimer.function = stmmac_tx_timer; + hrtimer_setup(&tx_q->txtimer, stmmac_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } for (chan = 0; chan < rx_channel_count; chan++) @@ -6970,8 +6969,7 @@ int stmmac_xdp_open(struct net_device *dev) stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, chan); - hrtimer_init(&tx_q->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - tx_q->txtimer.function = stmmac_tx_timer; + hrtimer_setup(&tx_q->txtimer, stmmac_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } /* Enable the MAC Rx/Tx */ diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 0d5a862cd78a6..3a13d60a947a8 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -99,6 +99,7 @@ config TI_K3_AM65_CPSW_NUSS select NET_DEVLINK select TI_DAVINCI_MDIO select PHYLINK + select PAGE_POOL select TI_K3_CPPI_DESC_POOL imply PHY_TI_GMII_SEL depends on TI_K3_AM65_CPTS || !TI_K3_AM65_CPTS diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2806238629f82..361169db4befb 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2311,8 +2311,8 @@ static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common) for (i = 0; i < common->tx_ch_num; i++) { struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; - hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - tx_chn->tx_hrtimer.function = &am65_cpsw_nuss_tx_timer_callback; + hrtimer_setup(&tx_chn->tx_hrtimer, &am65_cpsw_nuss_tx_timer_callback, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); ret = devm_request_irq(dev, tx_chn->irq, am65_cpsw_nuss_tx_irq, @@ -2565,9 +2565,8 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) snprintf(flow->name, sizeof(flow->name), "%s-rx%d", dev_name(dev), i); - hrtimer_init(&flow->rx_hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - flow->rx_hrtimer.function = &am65_cpsw_nuss_rx_timer_callback; + hrtimer_setup(&flow->rx_hrtimer, &am65_cpsw_nuss_rx_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); ret = devm_request_irq(dev, flow->irq, am65_cpsw_nuss_rx_irq, diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 768578c0d9587..d59c1744840af 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -474,26 +474,7 @@ static int icss_iep_perout_enable_hw(struct icss_iep *iep, static int icss_iep_perout_enable(struct icss_iep *iep, struct ptp_perout_request *req, int on) { - int ret = 0; - - mutex_lock(&iep->ptp_clk_mutex); - - if (iep->pps_enabled) { - ret = -EBUSY; - goto exit; - } - - if (iep->perout_enabled == !!on) - goto exit; - - ret = icss_iep_perout_enable_hw(iep, req, on); - if (!ret) - iep->perout_enabled = !!on; - -exit: - mutex_unlock(&iep->ptp_clk_mutex); - - return ret; + return -EOPNOTSUPP; } static void icss_iep_cap_cmp_work(struct work_struct *work) diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index 74f0f200a89d4..6c1b8ff563e06 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -249,9 +249,8 @@ int prueth_ndev_add_tx_napi(struct prueth_emac *emac) struct prueth_tx_chn *tx_chn = &emac->tx_chns[i]; netif_napi_add_tx(emac->ndev, &tx_chn->napi_tx, emac_napi_tx_poll); - hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - tx_chn->tx_hrtimer.function = &emac_tx_timer_callback; + hrtimer_setup(&tx_chn->tx_hrtimer, &emac_tx_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); ret = request_irq(tx_chn->irq, prueth_tx_irq, IRQF_TRIGGER_HIGH, tx_chn->name, tx_chn); diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 00ed978605478..d3bdde6bd3aac 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -1169,9 +1169,8 @@ static int prueth_netdev_init(struct prueth *prueth, ndev->hw_features |= NETIF_PRUETH_HSR_OFFLOAD_FEATURES; netif_napi_add(ndev, &emac->napi_rx, icssg_napi_rx_poll); - hrtimer_init(&emac->rx_hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - emac->rx_hrtimer.function = &emac_rx_timer_callback; + hrtimer_setup(&emac->rx_hrtimer, &emac_rx_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); prueth->emac[mac] = emac; return 0; diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index f632b0cfd5ae0..fd91f8a45bce8 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -776,8 +776,8 @@ at86rf230_setup_spi_messages(struct at86rf230_local *lp, state->trx.tx_buf = state->buf; state->trx.rx_buf = state->buf; spi_message_add_tail(&state->trx, &state->msg); - hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - state->timer.function = at86rf230_async_state_timer; + hrtimer_setup(&state->timer, at86rf230_async_state_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } static irqreturn_t at86rf230_isr(int irq, void *data) diff --git a/drivers/net/ipa/data/ipa_data-v4.7.c b/drivers/net/ipa/data/ipa_data-v4.7.c index c8c23d9be961b..41f212209993f 100644 --- a/drivers/net/ipa/data/ipa_data-v4.7.c +++ b/drivers/net/ipa/data/ipa_data-v4.7.c @@ -28,20 +28,18 @@ enum ipa_resource_type { enum ipa_rsrc_group_id { /* Source resource group identifiers */ IPA_RSRC_GROUP_SRC_UL_DL = 0, - IPA_RSRC_GROUP_SRC_UC_RX_Q, IPA_RSRC_GROUP_SRC_COUNT, /* Last in set; not a source group */ /* Destination resource group identifiers */ - IPA_RSRC_GROUP_DST_UL_DL_DPL = 0, - IPA_RSRC_GROUP_DST_UNUSED_1, + IPA_RSRC_GROUP_DST_UL_DL = 0, IPA_RSRC_GROUP_DST_COUNT, /* Last; not a destination group */ }; /* QSB configuration data for an SoC having IPA v4.7 */ static const struct ipa_qsb_data ipa_qsb_data[] = { [IPA_QSB_MASTER_DDR] = { - .max_writes = 8, - .max_reads = 0, /* no limit (hardware max) */ + .max_writes = 12, + .max_reads = 13, .max_reads_beats = 120, }, }; @@ -81,7 +79,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = { }, .endpoint = { .config = { - .resource_group = IPA_RSRC_GROUP_DST_UL_DL_DPL, + .resource_group = IPA_RSRC_GROUP_DST_UL_DL, .aggregation = true, .status_enable = true, .rx = { @@ -106,6 +104,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = { .filter_support = true, .config = { .resource_group = IPA_RSRC_GROUP_SRC_UL_DL, + .checksum = true, .qmap = true, .status_enable = true, .tx = { @@ -128,7 +127,8 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = { }, .endpoint = { .config = { - .resource_group = IPA_RSRC_GROUP_DST_UL_DL_DPL, + .resource_group = IPA_RSRC_GROUP_DST_UL_DL, + .checksum = true, .qmap = true, .aggregation = true, .rx = { @@ -197,12 +197,12 @@ static const struct ipa_resource ipa_resource_src[] = { /* Destination resource configuration data for an SoC having IPA v4.7 */ static const struct ipa_resource ipa_resource_dst[] = { [IPA_RESOURCE_TYPE_DST_DATA_SECTORS] = { - .limits[IPA_RSRC_GROUP_DST_UL_DL_DPL] = { + .limits[IPA_RSRC_GROUP_DST_UL_DL] = { .min = 7, .max = 7, }, }, [IPA_RESOURCE_TYPE_DST_DPS_DMARS] = { - .limits[IPA_RSRC_GROUP_DST_UL_DL_DPL] = { + .limits[IPA_RSRC_GROUP_DST_UL_DL] = { .min = 2, .max = 2, }, }, diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index fd591ddb3884d..ca62188a317ad 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -416,20 +416,25 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { - const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - struct rtable *rt; int err, ret = NET_XMIT_DROP; + const struct iphdr *ip4h; + struct rtable *rt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, }; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + goto err; + + ip4h = ip_hdr(skb); + fl4.daddr = ip4h->daddr; + fl4.saddr = ip4h->saddr; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; @@ -488,6 +493,12 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) struct net_device *dev = skb->dev; int err, ret = NET_XMIT_DROP; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) { + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); + return ret; + } + err = ipvlan_route_v6_outbound(dev, skb); if (unlikely(err)) { DEV_STATS_INC(dev, tx_errors); diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index c8840c3b9a1bc..f1d68153987e1 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -244,8 +244,22 @@ static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } +static int blackhole_neigh_output(struct neighbour *n, struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int blackhole_neigh_construct(struct net_device *dev, + struct neighbour *n) +{ + n->output = blackhole_neigh_output; + return 0; +} + static const struct net_device_ops blackhole_netdev_ops = { .ndo_start_xmit = blackhole_netdev_xmit, + .ndo_neigh_construct = blackhole_neigh_construct, }; /* This is a dst-dummy device used specifically for invalidated diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index d247fe483c588..c1e72253063b5 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -507,6 +507,9 @@ static int mctp_i3c_header_create(struct sk_buff *skb, struct net_device *dev, { struct mctp_i3c_internal_hdr *ihdr; + if (!daddr || !saddr) + return -EINVAL; + skb_push(skb, sizeof(struct mctp_i3c_internal_hdr)); skb_reset_mac_header(skb); ihdr = (void *)skb_mac_header(skb); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 5c80fbee79138..7ab358616e035 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -184,9 +184,11 @@ static const struct ethtool_ops nsim_ethtool_ops = { static void nsim_ethtool_ring_init(struct netdevsim *ns) { + ns->ethtool.ring.rx_pending = 512; ns->ethtool.ring.rx_max_pending = 4096; ns->ethtool.ring.rx_jumbo_max_pending = 4096; ns->ethtool.ring.rx_mini_max_pending = 4096; + ns->ethtool.ring.tx_pending = 512; ns->ethtool.ring.tx_max_pending = 4096; } diff --git a/drivers/net/phy/qcom/qca807x.c b/drivers/net/phy/qcom/qca807x.c index 3279de857b474..2ad8c2586d643 100644 --- a/drivers/net/phy/qcom/qca807x.c +++ b/drivers/net/phy/qcom/qca807x.c @@ -774,7 +774,7 @@ static int qca807x_config_init(struct phy_device *phydev) control_dac &= ~QCA807X_CONTROL_DAC_MASK; if (!priv->dac_full_amplitude) control_dac |= QCA807X_CONTROL_DAC_DSP_AMPLITUDE; - if (!priv->dac_full_amplitude) + if (!priv->dac_full_bias_current) control_dac |= QCA807X_CONTROL_DAC_DSP_BIAS_CURRENT; if (!priv->dac_disable_bias_current_tweak) control_dac |= QCA807X_CONTROL_DAC_BIAS_CURRENT_TWEAK; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 4583e15ad03a0..1420c4efa48e6 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -72,6 +72,17 @@ #define PPP_PROTO_LEN 2 #define PPP_LCP_HDRLEN 4 +/* The filter instructions generated by libpcap are constructed + * assuming a four-byte PPP header on each packet, where the last + * 2 bytes are the protocol field defined in the RFC and the first + * byte of the first 2 bytes indicates the direction. + * The second byte is currently unused, but we still need to initialize + * it to prevent crafted BPF programs from reading them which would + * cause reading of uninitialized data. + */ +#define PPP_FILTER_OUTBOUND_TAG 0x0100 +#define PPP_FILTER_INBOUND_TAG 0x0000 + /* * An instance of /dev/ppp can be associated with either a ppp * interface unit or a ppp channel. In both cases, file->private_data @@ -1762,10 +1773,10 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) if (proto < 0x8000) { #ifdef CONFIG_PPP_FILTER - /* check if we should pass this packet */ - /* the filter instructions are constructed assuming - a four-byte PPP header on each packet */ - *(u8 *)skb_push(skb, 2) = 1; + /* check if the packet passes the pass and active filters. + * See comment for PPP_FILTER_OUTBOUND_TAG above. + */ + *(__be16 *)skb_push(skb, 2) = htons(PPP_FILTER_OUTBOUND_TAG); if (ppp->pass_filter && bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) @@ -2482,14 +2493,13 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) /* network protocol frame - give it to the kernel */ #ifdef CONFIG_PPP_FILTER - /* check if the packet passes the pass and active filters */ - /* the filter instructions are constructed assuming - a four-byte PPP header on each packet */ if (ppp->pass_filter || ppp->active_filter) { if (skb_unclone(skb, GFP_ATOMIC)) goto err; - - *(u8 *)skb_push(skb, 2) = 0; + /* Check if the packet passes the pass and active filters. + * See comment for PPP_FILTER_INBOUND_TAG above. + */ + *(__be16 *)skb_push(skb, 2) = htons(PPP_FILTER_INBOUND_TAG); if (ppp->pass_filter && bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index d5c47a2a62dcc..34e82f1e37d96 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -833,8 +833,7 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ ctx->dev = dev; - hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ctx->tx_timer.function = &cdc_ncm_tx_timer_cb; + hrtimer_setup(&ctx->tx_timer, &cdc_ncm_tx_timer_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL); tasklet_setup(&ctx->bh, cdc_ncm_txpath_bh); atomic_set(&ctx->stop, 0); spin_lock_init(&ctx->mtx); diff --git a/drivers/net/usb/gl620a.c b/drivers/net/usb/gl620a.c index 46af78caf457a..0bfa37c140591 100644 --- a/drivers/net/usb/gl620a.c +++ b/drivers/net/usb/gl620a.c @@ -179,9 +179,7 @@ static int genelink_bind(struct usbnet *dev, struct usb_interface *intf) { dev->hard_mtu = GL_RCV_BUF_SIZE; dev->net->hard_header_len += 4; - dev->in = usb_rcvbulkpipe(dev->udev, dev->driver_info->in); - dev->out = usb_sndbulkpipe(dev->udev, dev->driver_info->out); - return 0; + return usbnet_get_endpoints(dev, intf); } static const struct driver_info genelink_info = { diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 60eb95fc19a5a..6bc107476a2a3 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -1172,6 +1172,7 @@ static int brcmf_ops_sdio_suspend(struct device *dev) struct brcmf_bus *bus_if; struct brcmf_sdio_dev *sdiodev; mmc_pm_flag_t sdio_flags; + bool cap_power_off; int ret = 0; func = container_of(dev, struct sdio_func, dev); @@ -1179,19 +1180,23 @@ static int brcmf_ops_sdio_suspend(struct device *dev) if (func->num != 1) return 0; + cap_power_off = !!(func->card->host->caps & MMC_CAP_POWER_OFF_CARD); bus_if = dev_get_drvdata(dev); sdiodev = bus_if->bus_priv.sdio; - if (sdiodev->wowl_enabled) { + if (sdiodev->wowl_enabled || !cap_power_off) { brcmf_sdiod_freezer_on(sdiodev); brcmf_sdio_wd_timer(sdiodev->bus, 0); sdio_flags = MMC_PM_KEEP_POWER; - if (sdiodev->settings->bus.sdio.oob_irq_supported) - enable_irq_wake(sdiodev->settings->bus.sdio.oob_irq_nr); - else - sdio_flags |= MMC_PM_WAKE_SDIO_IRQ; + + if (sdiodev->wowl_enabled) { + if (sdiodev->settings->bus.sdio.oob_irq_supported) + enable_irq_wake(sdiodev->settings->bus.sdio.oob_irq_nr); + else + sdio_flags |= MMC_PM_WAKE_SDIO_IRQ; + } if (sdio_set_host_pm_flags(sdiodev->func1, sdio_flags)) brcmf_err("Failed to set pm_flags %x\n", sdio_flags); @@ -1213,18 +1218,19 @@ static int brcmf_ops_sdio_resume(struct device *dev) struct brcmf_sdio_dev *sdiodev = bus_if->bus_priv.sdio; struct sdio_func *func = container_of(dev, struct sdio_func, dev); int ret = 0; + bool cap_power_off = !!(func->card->host->caps & MMC_CAP_POWER_OFF_CARD); brcmf_dbg(SDIO, "Enter: F%d\n", func->num); if (func->num != 2) return 0; - if (!sdiodev->wowl_enabled) { + if (!sdiodev->wowl_enabled && cap_power_off) { /* bus was powered off and device removed, probe again */ ret = brcmf_sdiod_probe(sdiodev); if (ret) brcmf_err("Failed to probe device on resume\n"); } else { - if (sdiodev->settings->bus.sdio.oob_irq_supported) + if (sdiodev->wowl_enabled && sdiodev->settings->bus.sdio.oob_irq_supported) disable_irq_wake(sdiodev->settings->bus.sdio.oob_irq_nr); brcmf_sdiod_freezer_off(sdiodev); diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index fb2ea38e89aca..6594216f873c4 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -558,41 +558,71 @@ static void iwl_dump_prph(struct iwl_fw_runtime *fwrt, } /* - * alloc_sgtable - allocates scallerlist table in the given size, - * fills it with pages and returns it + * alloc_sgtable - allocates (chained) scatterlist in the given size, + * fills it with pages and returns it * @size: the size (in bytes) of the table -*/ -static struct scatterlist *alloc_sgtable(int size) + */ +static struct scatterlist *alloc_sgtable(ssize_t size) { - int alloc_size, nents, i; - struct page *new_page; - struct scatterlist *iter; - struct scatterlist *table; + struct scatterlist *result = NULL, *prev; + int nents, i, n_prev; nents = DIV_ROUND_UP(size, PAGE_SIZE); - table = kcalloc(nents, sizeof(*table), GFP_KERNEL); - if (!table) - return NULL; - sg_init_table(table, nents); - iter = table; - for_each_sg(table, iter, sg_nents(table), i) { - new_page = alloc_page(GFP_KERNEL); - if (!new_page) { - /* release all previous allocated pages in the table */ - iter = table; - for_each_sg(table, iter, sg_nents(table), i) { - new_page = sg_page(iter); - if (new_page) - __free_page(new_page); - } - kfree(table); + +#define N_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(*result)) + /* + * We need an additional entry for table chaining, + * this ensures the loop can finish i.e. we can + * fit at least two entries per page (obviously, + * many more really fit.) + */ + BUILD_BUG_ON(N_ENTRIES_PER_PAGE < 2); + + while (nents > 0) { + struct scatterlist *new, *iter; + int n_fill, n_alloc; + + if (nents <= N_ENTRIES_PER_PAGE) { + /* last needed table */ + n_fill = nents; + n_alloc = nents; + nents = 0; + } else { + /* fill a page with entries */ + n_alloc = N_ENTRIES_PER_PAGE; + /* reserve one for chaining */ + n_fill = n_alloc - 1; + nents -= n_fill; + } + + new = kcalloc(n_alloc, sizeof(*new), GFP_KERNEL); + if (!new) { + if (result) + _devcd_free_sgtable(result); return NULL; } - alloc_size = min_t(int, size, PAGE_SIZE); - size -= PAGE_SIZE; - sg_set_page(iter, new_page, alloc_size, 0); + sg_init_table(new, n_alloc); + + if (!result) + result = new; + else + sg_chain(prev, n_prev, new); + prev = new; + n_prev = n_alloc; + + for_each_sg(new, iter, n_fill, i) { + struct page *new_page = alloc_page(GFP_KERNEL); + + if (!new_page) { + _devcd_free_sgtable(result); + return NULL; + } + + sg_set_page(iter, new_page, PAGE_SIZE, 0); + } } - return table; + + return result; } static void iwl_fw_get_prph_len(struct iwl_fw_runtime *fwrt, diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dump.c b/drivers/net/wireless/intel/iwlwifi/fw/dump.c index 8e0c85a1240d7..c7b261c8ec969 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dump.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dump.c @@ -540,6 +540,9 @@ bool iwl_fwrt_read_err_table(struct iwl_trans *trans, u32 base, u32 *err_id) } err_info = {}; int ret; + if (err_id) + *err_id = 0; + if (!base) return false; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index d3a65f33097cb..352b6e73e08f3 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -1181,7 +1181,7 @@ static int iwl_parse_tlv_firmware(struct iwl_drv *drv, if (tlv_len != sizeof(*fseq_ver)) goto invalid_tlv_len; - IWL_INFO(drv, "TLV_FW_FSEQ_VERSION: %s\n", + IWL_INFO(drv, "TLV_FW_FSEQ_VERSION: %.32s\n", fseq_ver->version); } break; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 129b6bdf9ef90..82ca7f8b1bb27 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -3092,8 +3092,14 @@ static void iwl_mvm_d3_disconnect_iter(void *data, u8 *mac, ieee80211_resume_disconnect(vif); } -static bool iwl_mvm_check_rt_status(struct iwl_mvm *mvm, - struct ieee80211_vif *vif) +enum rt_status { + FW_ALIVE, + FW_NEEDS_RESET, + FW_ERROR, +}; + +static enum rt_status iwl_mvm_check_rt_status(struct iwl_mvm *mvm, + struct ieee80211_vif *vif) { u32 err_id; @@ -3101,29 +3107,35 @@ static bool iwl_mvm_check_rt_status(struct iwl_mvm *mvm, if (iwl_fwrt_read_err_table(mvm->trans, mvm->trans->dbg.lmac_error_event_table[0], &err_id)) { - if (err_id == RF_KILL_INDICATOR_FOR_WOWLAN && vif) { - struct cfg80211_wowlan_wakeup wakeup = { - .rfkill_release = true, - }; - ieee80211_report_wowlan_wakeup(vif, &wakeup, - GFP_KERNEL); + if (err_id == RF_KILL_INDICATOR_FOR_WOWLAN) { + IWL_WARN(mvm, "Rfkill was toggled during suspend\n"); + if (vif) { + struct cfg80211_wowlan_wakeup wakeup = { + .rfkill_release = true, + }; + + ieee80211_report_wowlan_wakeup(vif, &wakeup, + GFP_KERNEL); + } + + return FW_NEEDS_RESET; } - return true; + return FW_ERROR; } /* check if we have lmac2 set and check for error */ if (iwl_fwrt_read_err_table(mvm->trans, mvm->trans->dbg.lmac_error_event_table[1], NULL)) - return true; + return FW_ERROR; /* check for umac error */ if (iwl_fwrt_read_err_table(mvm->trans, mvm->trans->dbg.umac_error_event_table, NULL)) - return true; + return FW_ERROR; - return false; + return FW_ALIVE; } /* @@ -3492,6 +3504,7 @@ static int __iwl_mvm_resume(struct iwl_mvm *mvm, bool test) bool d0i3_first = fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_D0I3_END_FIRST); bool resume_notif_based = iwl_mvm_d3_resume_notif_based(mvm); + enum rt_status rt_status; bool keep = false; mutex_lock(&mvm->mutex); @@ -3515,14 +3528,19 @@ static int __iwl_mvm_resume(struct iwl_mvm *mvm, bool test) iwl_fw_dbg_read_d3_debug_data(&mvm->fwrt); - if (iwl_mvm_check_rt_status(mvm, vif)) { - IWL_ERR(mvm, "FW Error occurred during suspend. Restarting.\n"); + rt_status = iwl_mvm_check_rt_status(mvm, vif); + if (rt_status != FW_ALIVE) { set_bit(STATUS_FW_ERROR, &mvm->trans->status); - iwl_mvm_dump_nic_error_log(mvm); - iwl_dbg_tlv_time_point(&mvm->fwrt, - IWL_FW_INI_TIME_POINT_FW_ASSERT, NULL); - iwl_fw_dbg_collect_desc(&mvm->fwrt, &iwl_dump_desc_assert, - false, 0); + if (rt_status == FW_ERROR) { + IWL_ERR(mvm, "FW Error occurred during suspend. Restarting.\n"); + iwl_mvm_dump_nic_error_log(mvm); + iwl_dbg_tlv_time_point(&mvm->fwrt, + IWL_FW_INI_TIME_POINT_FW_ASSERT, + NULL); + iwl_fw_dbg_collect_desc(&mvm->fwrt, + &iwl_dump_desc_assert, + false, 0); + } ret = 1; goto err; } @@ -3679,6 +3697,7 @@ int iwl_mvm_fast_resume(struct iwl_mvm *mvm) .notif_expected = IWL_D3_NOTIF_D3_END_NOTIF, }; + enum rt_status rt_status; int ret; lockdep_assert_held(&mvm->mutex); @@ -3688,14 +3707,20 @@ int iwl_mvm_fast_resume(struct iwl_mvm *mvm) mvm->last_reset_or_resume_time_jiffies = jiffies; iwl_fw_dbg_read_d3_debug_data(&mvm->fwrt); - if (iwl_mvm_check_rt_status(mvm, NULL)) { - IWL_ERR(mvm, "FW Error occurred during suspend. Restarting.\n"); + rt_status = iwl_mvm_check_rt_status(mvm, NULL); + if (rt_status != FW_ALIVE) { set_bit(STATUS_FW_ERROR, &mvm->trans->status); - iwl_mvm_dump_nic_error_log(mvm); - iwl_dbg_tlv_time_point(&mvm->fwrt, - IWL_FW_INI_TIME_POINT_FW_ASSERT, NULL); - iwl_fw_dbg_collect_desc(&mvm->fwrt, &iwl_dump_desc_assert, - false, 0); + if (rt_status == FW_ERROR) { + IWL_ERR(mvm, + "iwl_mvm_check_rt_status failed, device is gone during suspend\n"); + iwl_mvm_dump_nic_error_log(mvm); + iwl_dbg_tlv_time_point(&mvm->fwrt, + IWL_FW_INI_TIME_POINT_FW_ASSERT, + NULL); + iwl_fw_dbg_collect_desc(&mvm->fwrt, + &iwl_dump_desc_assert, + false, 0); + } mvm->trans->state = IWL_TRANS_NO_FW; ret = -ENODEV; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c index 83e3c11603622..55d035b896e91 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c @@ -1479,6 +1479,13 @@ static ssize_t iwl_dbgfs_fw_dbg_clear_write(struct iwl_mvm *mvm, if (mvm->trans->trans_cfg->device_family < IWL_DEVICE_FAMILY_9000) return -EOPNOTSUPP; + /* + * If the firmware is not running, silently succeed since there is + * no data to clear. + */ + if (!iwl_mvm_firmware_running(mvm)) + return count; + mutex_lock(&mvm->mutex); iwl_fw_dbg_clear_monitor_buf(&mvm->fwrt); mutex_unlock(&mvm->mutex); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 09fd8752046ee..14ea89f931bbf 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -995,7 +995,7 @@ iwl_mvm_decode_he_phy_ru_alloc(struct iwl_mvm_rx_phy_data *phy_data, */ u8 ru = le32_get_bits(phy_data->d1, IWL_RX_PHY_DATA1_HE_RU_ALLOC_MASK); u32 rate_n_flags = phy_data->rate_n_flags; - u32 he_type = rate_n_flags & RATE_MCS_HE_TYPE_MSK_V1; + u32 he_type = rate_n_flags & RATE_MCS_HE_TYPE_MSK; u8 offs = 0; rx_status->bw = RATE_INFO_BW_HE_RU; @@ -1050,13 +1050,13 @@ iwl_mvm_decode_he_phy_ru_alloc(struct iwl_mvm_rx_phy_data *phy_data, if (he_mu) he_mu->flags2 |= - le16_encode_bits(FIELD_GET(RATE_MCS_CHAN_WIDTH_MSK_V1, + le16_encode_bits(FIELD_GET(RATE_MCS_CHAN_WIDTH_MSK, rate_n_flags), IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW); - else if (he_type == RATE_MCS_HE_TYPE_TRIG_V1) + else if (he_type == RATE_MCS_HE_TYPE_TRIG) he->data6 |= cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA6_TB_PPDU_BW_KNOWN) | - le16_encode_bits(FIELD_GET(RATE_MCS_CHAN_WIDTH_MSK_V1, + le16_encode_bits(FIELD_GET(RATE_MCS_CHAN_WIDTH_MSK, rate_n_flags), IEEE80211_RADIOTAP_HE_DATA6_TB_PPDU_BW); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 9216c43a35c4d..ebfa88b38b71b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -1030,6 +1030,8 @@ void iwl_mvm_rx_session_protect_notif(struct iwl_mvm *mvm, /* End TE, notify mac80211 */ mvmvif->time_event_data.id = SESSION_PROTECT_CONF_MAX_ID; mvmvif->time_event_data.link_id = -1; + /* set the bit so the ROC cleanup will actually clean up */ + set_bit(IWL_MVM_STATUS_ROC_P2P_RUNNING, &mvm->status); iwl_mvm_roc_finished(mvm); ieee80211_remain_on_channel_expired(mvm->hw); } else if (le32_to_cpu(notif->start)) { diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index 856b7e9f717d5..45460f93d24ad 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* - * Copyright (C) 2003-2015, 2018-2024 Intel Corporation + * Copyright (C) 2003-2015, 2018-2025 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH */ @@ -646,7 +646,8 @@ dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, unsigned int len); struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *cmd_meta, - u8 **hdr, unsigned int hdr_room); + u8 **hdr, unsigned int hdr_room, + unsigned int offset); void iwl_pcie_free_tso_pages(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *cmd_meta); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 1f483f15c2383..401919f9fe88e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020, 2023-2024 Intel Corporation + * Copyright (C) 2018-2020, 2023-2025 Intel Corporation */ #include #include @@ -188,7 +188,8 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans, (3 + snap_ip_tcp_hdrlen + sizeof(struct ethhdr)); /* Our device supports 9 segments at most, it will fit in 1 page */ - sgt = iwl_pcie_prep_tso(trans, skb, out_meta, &start_hdr, hdr_room); + sgt = iwl_pcie_prep_tso(trans, skb, out_meta, &start_hdr, hdr_room, + snap_ip_tcp_hdrlen + hdr_len); if (!sgt) return -ENOMEM; @@ -347,6 +348,7 @@ iwl_tfh_tfd *iwl_txq_gen2_build_tx_amsdu(struct iwl_trans *trans, return tfd; out_err: + iwl_pcie_free_tso_pages(trans, skb, out_meta); iwl_txq_gen2_tfd_unmap(trans, out_meta, tfd); return NULL; } diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index 334ebd4c12fa7..7b6071a59b694 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2003-2014, 2018-2021, 2023-2024 Intel Corporation + * Copyright (C) 2003-2014, 2018-2021, 2023-2025 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH */ @@ -1855,6 +1855,7 @@ dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, * @cmd_meta: command meta to store the scatter list information for unmapping * @hdr: output argument for TSO headers * @hdr_room: requested length for TSO headers + * @offset: offset into the data from which mapping should start * * Allocate space for a scatter gather list and TSO headers and map the SKB * using the scatter gather list. The SKB is unmapped again when the page is @@ -1864,18 +1865,20 @@ dma_addr_t iwl_pcie_get_sgt_tb_phys(struct sg_table *sgt, unsigned int offset, */ struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *cmd_meta, - u8 **hdr, unsigned int hdr_room) + u8 **hdr, unsigned int hdr_room, + unsigned int offset) { struct sg_table *sgt; + unsigned int n_segments; if (WARN_ON_ONCE(skb_has_frag_list(skb))) return NULL; + n_segments = DIV_ROUND_UP(skb->len - offset, skb_shinfo(skb)->gso_size); *hdr = iwl_pcie_get_page_hdr(trans, hdr_room + __alignof__(struct sg_table) + sizeof(struct sg_table) + - (skb_shinfo(skb)->nr_frags + 1) * - sizeof(struct scatterlist), + n_segments * sizeof(struct scatterlist), skb); if (!*hdr) return NULL; @@ -1883,11 +1886,11 @@ struct sg_table *iwl_pcie_prep_tso(struct iwl_trans *trans, struct sk_buff *skb, sgt = (void *)PTR_ALIGN(*hdr + hdr_room, __alignof__(struct sg_table)); sgt->sgl = (void *)(sgt + 1); - sg_init_table(sgt->sgl, skb_shinfo(skb)->nr_frags + 1); + sg_init_table(sgt->sgl, n_segments); /* Only map the data, not the header (it is copied to the TSO page) */ - sgt->orig_nents = skb_to_sgvec(skb, sgt->sgl, skb_headlen(skb), - skb->data_len); + sgt->orig_nents = skb_to_sgvec(skb, sgt->sgl, offset, + skb->len - offset); if (WARN_ON_ONCE(sgt->orig_nents <= 0)) return NULL; @@ -1939,7 +1942,8 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, (3 + snap_ip_tcp_hdrlen + sizeof(struct ethhdr)) + iv_len; /* Our device supports 9 segments at most, it will fit in 1 page */ - sgt = iwl_pcie_prep_tso(trans, skb, out_meta, &start_hdr, hdr_room); + sgt = iwl_pcie_prep_tso(trans, skb, out_meta, &start_hdr, hdr_room, + snap_ip_tcp_hdrlen + hdr_len + iv_len); if (!sgt) return -ENOMEM; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c index 0e1ede9314d8f..4840d0b500b30 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c @@ -264,8 +264,8 @@ void mt76x02u_init_beacon_config(struct mt76x02_dev *dev) }; dev->beacon_ops = &beacon_ops; - hrtimer_init(&dev->pre_tbtt_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - dev->pre_tbtt_timer.function = mt76x02u_pre_tbtt_interrupt; + hrtimer_setup(&dev->pre_tbtt_timer, mt76x02u_pre_tbtt_interrupt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); INIT_WORK(&dev->pre_tbtt_work, mt76x02u_pre_tbtt_work); mt76x02_init_beacon_config(dev); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c b/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c index 5323acff962a3..45775ecdf2215 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c @@ -842,7 +842,7 @@ int rt2800mmio_probe_hw(struct rt2x00_dev *rt2x00dev) /* * Set txstatus timer function. */ - rt2x00dev->txstatus_timer.function = rt2800mmio_tx_sta_fifo_timeout; + hrtimer_update_function(&rt2x00dev->txstatus_timer, rt2800mmio_tx_sta_fifo_timeout); /* * Overwrite TX done handler diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c index 160bef79acdb7..b51a23300ba24 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c @@ -618,7 +618,7 @@ static int rt2800usb_probe_hw(struct rt2x00_dev *rt2x00dev) /* * Set txstatus timer function. */ - rt2x00dev->txstatus_timer.function = rt2800usb_tx_sta_fifo_timeout; + hrtimer_update_function(&rt2x00dev->txstatus_timer, rt2800usb_tx_sta_fifo_timeout); /* * Overwrite TX done handler diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 9e7d9dbe954cf..432ddfac2c33e 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -1391,8 +1391,8 @@ int rt2x00lib_probe_dev(struct rt2x00_dev *rt2x00dev) mutex_init(&rt2x00dev->conf_mutex); INIT_LIST_HEAD(&rt2x00dev->bar_list); spin_lock_init(&rt2x00dev->bar_list_lock); - hrtimer_init(&rt2x00dev->txstatus_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + hrtimer_setup(&rt2x00dev->txstatus_timer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); set_bit(DEVICE_STATE_PRESENT, &rt2x00dev->flags); diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index cf6a331d40427..fb187a9e984ed 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -5548,10 +5548,8 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); for (i = 0; i < ARRAY_SIZE(data->link_data); i++) { - hrtimer_init(&data->link_data[i].beacon_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_SOFT); - data->link_data[i].beacon_timer.function = - mac80211_hwsim_beacon; + hrtimer_setup(&data->link_data[i].beacon_timer, mac80211_hwsim_beacon, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS_SOFT); data->link_data[i].link_id = i; } diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem.c b/drivers/net/wwan/iosm/iosm_ipc_imem.c index 829515a601b37..530a3ea47a1a7 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_imem.c +++ b/drivers/net/wwan/iosm/iosm_ipc_imem.c @@ -1381,24 +1381,20 @@ struct iosm_imem *ipc_imem_init(struct iosm_pcie *pcie, unsigned int device_id, /* The phase is set to power off. */ ipc_imem->phase = IPC_P_OFF; - hrtimer_init(&ipc_imem->startup_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->startup_timer.function = ipc_imem_startup_timer_cb; + hrtimer_setup(&ipc_imem->startup_timer, ipc_imem_startup_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->tdupdate_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->tdupdate_timer.function = ipc_imem_td_update_timer_cb; + hrtimer_setup(&ipc_imem->tdupdate_timer, ipc_imem_td_update_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->fast_update_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->fast_update_timer.function = ipc_imem_fast_update_timer_cb; + hrtimer_setup(&ipc_imem->fast_update_timer, ipc_imem_fast_update_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->td_alloc_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->td_alloc_timer.function = ipc_imem_td_alloc_timer_cb; + hrtimer_setup(&ipc_imem->td_alloc_timer, ipc_imem_td_alloc_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->adb_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ipc_imem->adb_timer.function = ipc_imem_adb_timer_cb; + hrtimer_setup(&ipc_imem->adb_timer, ipc_imem_adb_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); if (ipc_imem_config(ipc_imem)) { dev_err(ipc_imem->dev, "failed to initialize the imem"); diff --git a/drivers/ntb/test/ntb_pingpong.c b/drivers/ntb/test/ntb_pingpong.c index 8aeca79140509..1c1c74f4ff2d3 100644 --- a/drivers/ntb/test/ntb_pingpong.c +++ b/drivers/ntb/test/ntb_pingpong.c @@ -284,8 +284,7 @@ static struct pp_ctx *pp_create_data(struct ntb_dev *ntb) pp->ntb = ntb; atomic_set(&pp->count, 0); spin_lock_init(&pp->lock); - hrtimer_init(&pp->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pp->timer.function = pp_timer_func; + hrtimer_setup(&pp->timer, pp_timer_func, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return pp; } diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 1de11b722f049..a060f69558e76 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1011,25 +1011,37 @@ static void apple_nvme_reset_work(struct work_struct *work) ret = apple_rtkit_shutdown(anv->rtk); if (ret) goto out; + + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); } - writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + /* + * Only do the soft-reset if the CPU is not running, which means either we + * or the previous stage shut it down cleanly. + */ + if (!(readl(anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL) & + APPLE_ANS_COPROC_CPU_CONTROL_RUN)) { - ret = reset_control_assert(anv->reset); - if (ret) - goto out; + ret = reset_control_assert(anv->reset); + if (ret) + goto out; - ret = apple_rtkit_reinit(anv->rtk); - if (ret) - goto out; + ret = apple_rtkit_reinit(anv->rtk); + if (ret) + goto out; - ret = reset_control_deassert(anv->reset); - if (ret) - goto out; + ret = reset_control_deassert(anv->reset); + if (ret) + goto out; + + writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN, + anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + + ret = apple_rtkit_boot(anv->rtk); + } else { + ret = apple_rtkit_wake(anv->rtk); + } - writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN, - anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); - ret = apple_rtkit_boot(anv->rtk); if (ret) { dev_err(anv->dev, "ANS did not boot"); goto out; @@ -1516,6 +1528,7 @@ static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev) return anv; put_dev: + apple_nvme_detach_genpd(anv); put_device(anv->dev); return ERR_PTR(ret); } @@ -1549,6 +1562,7 @@ static int apple_nvme_probe(struct platform_device *pdev) nvme_uninit_ctrl(&anv->ctrl); out_put_ctrl: nvme_put_ctrl(&anv->ctrl); + apple_nvme_detach_genpd(anv); return ret; } @@ -1563,9 +1577,12 @@ static void apple_nvme_remove(struct platform_device *pdev) apple_nvme_disable(anv, true); nvme_uninit_ctrl(&anv->ctrl); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { apple_rtkit_shutdown(anv->rtk); + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } + apple_nvme_detach_genpd(anv); } @@ -1574,8 +1591,11 @@ static void apple_nvme_shutdown(struct platform_device *pdev) struct apple_nvme *anv = platform_get_drvdata(pdev); apple_nvme_disable(anv, true); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { apple_rtkit_shutdown(anv->rtk); + + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } } static int apple_nvme_resume(struct device *dev) @@ -1592,10 +1612,11 @@ static int apple_nvme_suspend(struct device *dev) apple_nvme_disable(anv, true); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { ret = apple_rtkit_shutdown(anv->rtk); - writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } return ret; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 818d4e49aab51..f028913e2e622 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -564,8 +564,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_LIVE: switch (old_state) { - case NVME_CTRL_NEW: - case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; fallthrough; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4f1866fbd5b8..b9929a5a7f4e3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -781,61 +781,12 @@ nvme_fc_abort_lsops(struct nvme_fc_rport *rport) static void nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) { - enum nvme_ctrl_state state; - unsigned long flags; - dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller connectivity lost. Awaiting " "Reconnect", ctrl->cnum); - spin_lock_irqsave(&ctrl->lock, flags); set_bit(ASSOC_FAILED, &ctrl->flags); - state = nvme_ctrl_state(&ctrl->ctrl); - spin_unlock_irqrestore(&ctrl->lock, flags); - - switch (state) { - case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: - /* - * Schedule a controller reset. The reset will terminate the - * association and schedule the reconnect timer. Reconnects - * will be attempted until either the ctlr_loss_tmo - * (max_retries * connect_delay) expires or the remoteport's - * dev_loss_tmo expires. - */ - if (nvme_reset_ctrl(&ctrl->ctrl)) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Couldn't schedule reset.\n", - ctrl->cnum); - nvme_delete_ctrl(&ctrl->ctrl); - } - break; - - case NVME_CTRL_CONNECTING: - /* - * The association has already been terminated and the - * controller is attempting reconnects. No need to do anything - * futher. Reconnects will be attempted until either the - * ctlr_loss_tmo (max_retries * connect_delay) expires or the - * remoteport's dev_loss_tmo expires. - */ - break; - - case NVME_CTRL_RESETTING: - /* - * Controller is already in the process of terminating the - * association. No need to do anything further. The reconnect - * step will kick in naturally after the association is - * terminated. - */ - break; - - case NVME_CTRL_DELETING: - case NVME_CTRL_DELETING_NOIO: - default: - /* no action to take - let it delete */ - break; - } + nvme_reset_ctrl(&ctrl->ctrl); } /** @@ -3071,7 +3022,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) struct nvmefc_ls_rcv_op *disls = NULL; unsigned long flags; int ret; - bool changed; ++ctrl->ctrl.nr_reconnects; @@ -3177,23 +3127,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) + ret = -EIO; if (ret) goto out_term_aen_ops; - spin_lock_irqsave(&ctrl->lock, flags); - if (!test_bit(ASSOC_FAILED, &ctrl->flags)) - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - else + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) { ret = -EIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - - if (ret) goto out_term_aen_ops; + } ctrl->ctrl.nr_reconnects = 0; - - if (changed) - nvme_start_ctrl(&ctrl->ctrl); + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e8930146847af..24e2c702da7a2 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -128,8 +128,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, if (!nvme_ctrl_sgl_supported(ctrl)) dev_warn_once(ctrl->device, "using unchecked data buffer\n"); if (has_metadata) { - if (!supports_metadata) - return -EINVAL; + if (!supports_metadata) { + ret = -EINVAL; + goto out; + } if (!nvme_ctrl_meta_sgl_supported(ctrl)) dev_warn_once(ctrl->device, "using unchecked metadata buffer\n"); @@ -139,8 +141,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct iov_iter iter; /* fixedbufs is only for non-vectored io */ - if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) - return -EINVAL; + if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) { + ret = -EINVAL; + goto out; + } ret = io_uring_cmd_import_fixed(ubuffer, bufflen, rq_data_dir(req), &iter, ioucmd); if (ret < 0) @@ -283,8 +287,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, { if (ns && nsid != ns->head->ns_id) { dev_err(ctrl->device, - "%s: nsid (%u) in cmd does not match nsid (%u)" - "of namespace\n", + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", current->comm, nsid, ns->head->ns_id); return false; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9197a5b173fdf..640590b217282 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1982,6 +1982,18 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (offset > bar_size) return; + /* + * Controllers may support a CMB size larger than their BAR, for + * example, due to being behind a bridge. Reduce the CMB to the + * reported size of the BAR + */ + size = min(size, bar_size - offset); + + if (!IS_ALIGNED(size, memremap_compat_align()) || + !IS_ALIGNED(pci_resource_start(pdev, bar), + memremap_compat_align())) + return; + /* * Tell the controller about the host side address mapping the CMB, * and enable CMB decoding for the NVMe 1.4+ scheme: @@ -1992,17 +2004,10 @@ static void nvme_map_cmb(struct nvme_dev *dev) dev->bar + NVME_REG_CMBMSC); } - /* - * Controllers may support a CMB size larger than their BAR, - * for example, due to being behind a bridge. Reduce the CMB to - * the reported size of the BAR - */ - if (size > bar_size - offset) - size = bar_size - offset; - if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { dev_warn(dev->ctrl.device, "failed to register the CMB\n"); + hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC); return; } @@ -3706,6 +3711,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1dbe, 0x5216), /* Acer/INNOGRIT FA100/5216 NVMe SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */ diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 841238f38fdda..327f3f2f5399c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -217,6 +217,19 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type) +{ + switch (type) { + case nvme_tcp_c2h_term: + case nvme_tcp_c2h_data: + case nvme_tcp_r2t: + case nvme_tcp_rsp: + return true; + default: + return false; + } +} + /* * Check if the queue is TLS encrypted */ @@ -763,6 +776,40 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return 0; } +static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue, + struct nvme_tcp_term_pdu *pdu) +{ + u16 fes; + const char *msg; + u32 plen = le32_to_cpu(pdu->hdr.plen); + + static const char * const msg_table[] = { + [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field", + [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error", + [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error", + [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range", + [NVME_TCP_FES_DATA_LIMIT_EXCEEDED] = "Data Transfer Limit Exceeded", + [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter", + }; + + if (plen < NVME_TCP_MIN_C2HTERM_PLEN || + plen > NVME_TCP_MAX_C2HTERM_PLEN) { + dev_err(queue->ctrl->ctrl.device, + "Received a malformed C2HTermReq PDU (plen = %u)\n", + plen); + return; + } + + fes = le16_to_cpu(pdu->fes); + if (fes && fes < ARRAY_SIZE(msg_table)) + msg = msg_table[fes]; + else + msg = "Unknown"; + + dev_err(queue->ctrl->ctrl.device, + "Received C2HTermReq (FES = %s)\n", msg); +} + static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -784,6 +831,25 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) { + if (!nvme_tcp_recv_pdu_supported(hdr->type)) + goto unsupported_pdu; + + dev_err(queue->ctrl->ctrl.device, + "pdu type %d has unexpected header length (%d)\n", + hdr->type, hdr->hlen); + return -EPROTO; + } + + if (unlikely(hdr->type == nvme_tcp_c2h_term)) { + /* + * C2HTermReq never includes Header or Data digests. + * Skip the checks. + */ + nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu); + return -EINVAL; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) @@ -807,10 +873,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_init_recv_ctx(queue); return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: - dev_err(queue->ctrl->ctrl.device, - "unsupported pdu type (%d)\n", hdr->type); - return -EINVAL; + goto unsupported_pdu; } + +unsupported_pdu: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) @@ -1449,8 +1518,11 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } + msg.msg_flags = MSG_WAITALL; ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); + if (ret >= 0 && ret < sizeof(*icresp)) + ret = -ECONNRESET; if (ret < 0) { pr_warn("queue %d: failed to receive icresp, error %d\n", nvme_tcp_queue_id(queue), ret); @@ -1565,7 +1637,7 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) ctrl->io_queues[HCTX_TYPE_POLL]; } -/** +/* * Track the number of queues assigned to each cpu using a global per-cpu * counter and select the least used cpu from the mq_map. Our goal is to spread * different controllers I/O threads across different cpu cores. @@ -2653,6 +2725,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + int ret; if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) return 0; @@ -2660,9 +2733,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) set_bit(NVME_TCP_Q_POLLING, &queue->flags); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); - nvme_tcp_try_recv(queue); + ret = nvme_tcp_try_recv(queue); clear_bit(NVME_TCP_Q_POLLING, &queue->flags); - return queue->nr_cqe; + return ret < 0 ? ret : queue->nr_cqe; } static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cdc4a09a6e8a4..2e741696f3712 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -606,6 +606,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns) goto out_dev_put; } + if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) + goto out_pr_exit; + nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); @@ -613,6 +616,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns) out_unlock: mutex_unlock(&subsys->lock); return ret; +out_pr_exit: + if (ns->pr.enable) + nvmet_pr_exit_ns(ns); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -638,6 +644,19 @@ void nvmet_ns_disable(struct nvmet_ns *ns) mutex_unlock(&subsys->lock); + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + if (ns->pr.enable) nvmet_pr_exit_ns(ns); @@ -660,22 +679,6 @@ void nvmet_ns_free(struct nvmet_ns *ns) if (ns->nsid == subsys->max_nsid) subsys->max_nsid = nvmet_max_nsid(subsys); - mutex_unlock(&subsys->lock); - - /* - * Now that we removed the namespaces from the lookup list, we - * can kill the per_cpu ref and wait for any remaining references - * to be dropped, as well as a RCU grace period for anyone only - * using the namepace under rcu_read_lock(). Note that we can't - * use call_rcu here as we need to ensure the namespaces have - * been fully destroyed before unloading the module. - */ - percpu_ref_kill(&ns->ref); - synchronize_rcu(); - wait_for_completion(&ns->disable_done); - percpu_ref_exit(&ns->ref); - - mutex_lock(&subsys->lock); subsys->nr_namespaces--; mutex_unlock(&subsys->lock); @@ -705,9 +708,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; - if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) - goto out_free; - if (ns->nsid > subsys->max_nsid) subsys->max_nsid = nsid; @@ -730,8 +730,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) return ns; out_exit: subsys->max_nsid = nvmet_max_nsid(subsys); - percpu_ref_exit(&ns->ref); -out_free: kfree(ns); out_unlock: mutex_unlock(&subsys->lock); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4be8d22d2d8d4..fcf4f460dc9a4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -647,7 +647,6 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, struct nvmet_host *host); void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page); -bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid); #define NVMET_MIN_QUEUE_SIZE 16 #define NVMET_MAX_QUEUE_SIZE 1024 @@ -784,37 +783,37 @@ u16 nvmet_report_invalid_opcode(struct nvmet_req *req); static inline bool nvmet_cc_en(u32 cc) { - return (cc >> NVME_CC_EN_SHIFT) & 0x1; + return (cc & NVME_CC_ENABLE) >> NVME_CC_EN_SHIFT; } static inline u8 nvmet_cc_css(u32 cc) { - return (cc >> NVME_CC_CSS_SHIFT) & 0x7; + return (cc & NVME_CC_CSS_MASK) >> NVME_CC_CSS_SHIFT; } static inline u8 nvmet_cc_mps(u32 cc) { - return (cc >> NVME_CC_MPS_SHIFT) & 0xf; + return (cc & NVME_CC_MPS_MASK) >> NVME_CC_MPS_SHIFT; } static inline u8 nvmet_cc_ams(u32 cc) { - return (cc >> NVME_CC_AMS_SHIFT) & 0x7; + return (cc & NVME_CC_AMS_MASK) >> NVME_CC_AMS_SHIFT; } static inline u8 nvmet_cc_shn(u32 cc) { - return (cc >> NVME_CC_SHN_SHIFT) & 0x3; + return (cc & NVME_CC_SHN_MASK) >> NVME_CC_SHN_SHIFT; } static inline u8 nvmet_cc_iosqes(u32 cc) { - return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; + return (cc & NVME_CC_IOSQES_MASK) >> NVME_CC_IOSQES_SHIFT; } static inline u8 nvmet_cc_iocqes(u32 cc) { - return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; + return (cc & NVME_CC_IOCQES_MASK) >> NVME_CC_IOCQES_SHIFT; } /* Convert a 32-bit number to a 16-bit 0's based number */ diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index ac30b42cc6221..565d2bd36dcde 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -46,7 +46,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex); /* * BAR CC register and SQ polling intervals. */ -#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(5) +#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(10) #define NVMET_PCI_EPF_SQ_POLL_INTERVAL msecs_to_jiffies(5) #define NVMET_PCI_EPF_SQ_POLL_IDLE msecs_to_jiffies(5000) @@ -1694,6 +1694,7 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) struct nvmet_pci_epf_ctrl *ctrl = container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work); struct nvmet_pci_epf_queue *sq; + unsigned long limit = jiffies; unsigned long last = 0; int i, nr_sqs; @@ -1708,6 +1709,16 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) nr_sqs++; } + /* + * If we have been running for a while, reschedule to let other + * tasks run and to avoid RCU stalls. + */ + if (time_is_before_jiffies(limit + secs_to_jiffies(1))) { + cond_resched(); + limit = jiffies; + continue; + } + if (nr_sqs) { last = jiffies; continue; @@ -1822,14 +1833,14 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) if (ctrl->io_sqes < sizeof(struct nvme_command)) { dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n", ctrl->io_sqes, sizeof(struct nvme_command)); - return -EINVAL; + goto err; } ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc); if (ctrl->io_cqes < sizeof(struct nvme_completion)) { dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n", ctrl->io_sqes, sizeof(struct nvme_completion)); - return -EINVAL; + goto err; } /* Create the admin queue. */ @@ -1844,7 +1855,7 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) qsize, pci_addr, 0); if (status != NVME_SC_SUCCESS) { dev_err(ctrl->dev, "Failed to create admin completion queue\n"); - return -EINVAL; + goto err; } qsize = aqa & 0x00000fff; @@ -1854,17 +1865,22 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) if (status != NVME_SC_SUCCESS) { dev_err(ctrl->dev, "Failed to create admin submission queue\n"); nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); - return -EINVAL; + goto err; } ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB; ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD; ctrl->enabled = true; + ctrl->csts = NVME_CSTS_RDY; /* Start polling the controller SQs. */ schedule_delayed_work(&ctrl->poll_sqs, 0); return 0; + +err: + ctrl->csts = 0; + return -EINVAL; } static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) @@ -1889,6 +1905,8 @@ static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) /* Delete the admin queue last. */ nvmet_pci_epf_delete_sq(ctrl->tctrl, 0); nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); + + ctrl->csts &= ~NVME_CSTS_RDY; } static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) @@ -1903,19 +1921,19 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) old_cc = ctrl->cc; new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC); + if (new_cc == old_cc) + goto reschedule_work; + ctrl->cc = new_cc; if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) { ret = nvmet_pci_epf_enable_ctrl(ctrl); if (ret) - return; - ctrl->csts |= NVME_CSTS_RDY; + goto reschedule_work; } - if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) { + if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) nvmet_pci_epf_disable_ctrl(ctrl); - ctrl->csts &= ~NVME_CSTS_RDY; - } if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) { nvmet_pci_epf_disable_ctrl(ctrl); @@ -1928,6 +1946,7 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) nvmet_update_cc(ctrl->tctrl, ctrl->cc); nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts); +reschedule_work: schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL); } diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1afd93026f9bf..2a4536ef61848 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -996,6 +996,27 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, nvmet_req_complete(&cmd->req, status); } +static bool nvmet_rdma_recv_not_live(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&queue->state_lock, flags); + /* + * recheck queue state is not live to prevent a race condition + * with RDMA_CM_EVENT_ESTABLISHED handler. + */ + if (queue->state == NVMET_RDMA_Q_LIVE) + ret = false; + else if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return ret; +} + static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = @@ -1038,17 +1059,9 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) rsp->n_rdma = 0; rsp->invalidate_rkey = 0; - if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { - unsigned long flags; - - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state == NVMET_RDMA_Q_CONNECTING) - list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); - else - nvmet_rdma_put_rsp(rsp); - spin_unlock_irqrestore(&queue->state_lock, flags); + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) && + nvmet_rdma_recv_not_live(queue, rsp)) return; - } nvmet_rdma_handle_command(queue, rsp); } diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 7c51c2a8c109a..4f9cac8a5abe0 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -571,10 +571,16 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req) struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; + enum nvmet_tcp_recv_state queue_state; + struct nvmet_tcp_cmd *queue_cmd; struct nvme_sgl_desc *sgl; u32 len; - if (unlikely(cmd == queue->cmd)) { + /* Pairs with store_release in nvmet_prepare_receive_pdu() */ + queue_state = smp_load_acquire(&queue->rcv_state); + queue_cmd = READ_ONCE(queue->cmd); + + if (unlikely(cmd == queue_cmd)) { sgl = &cmd->req.cmd->common.dptr.sgl; len = le32_to_cpu(sgl->length); @@ -583,7 +589,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req) * Avoid using helpers, this might happen before * nvmet_req_init is completed. */ - if (queue->rcv_state == NVMET_TCP_RECV_PDU && + if (queue_state == NVMET_TCP_RECV_PDU && len && len <= cmd->req.port->inline_data_size && nvme_is_write(cmd->req.cmd)) return; @@ -847,8 +853,9 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) { queue->offset = 0; queue->left = sizeof(struct nvme_tcp_hdr); - queue->cmd = NULL; - queue->rcv_state = NVMET_TCP_RECV_PDU; + WRITE_ONCE(queue->cmd, NULL); + /* Ensure rcv_state is visible only after queue->cmd is set */ + smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU); } static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 75e819f66a561..ee2e31522d7ef 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -415,12 +415,12 @@ static int __init __reserved_mem_alloc_size(unsigned long node, const char *unam prop = of_get_flat_dt_prop(node, "alignment", &len); if (prop) { - if (len != dt_root_size_cells * sizeof(__be32)) { + if (len != dt_root_addr_cells * sizeof(__be32)) { pr_err("invalid alignment property in '%s' node.\n", uname); return -EINVAL; } - align = dt_mem_next_cell(dt_root_size_cells, &prop); + align = dt_mem_next_cell(dt_root_addr_cells, &prop); } nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL; diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2fbd379923fd1..5c3054aaec8c1 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -203,6 +203,12 @@ config PCI_P2PDMA P2P DMA transactions must be between devices behind the same root port. + Enabling this option will reduce the entropy of x86 KASLR memory + regions. For example - on a 46 bit system, the entropy goes down + from 16 bits to 15 bits. The actual reduction in entropy depends + on the physical address bits, on processor features, kernel config + (5 level page table) and physical memory present on the system. + If unsure, say N. config PCI_LABEL diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index d5fcea3d4328b..1a0d0e1a22633 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1273,9 +1273,8 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn) /* No overflow interrupt? Have to use a timer instead. */ if (!ccn->irq) { dev_info(ccn->dev, "No access to interrupts, using timer.\n"); - hrtimer_init(&ccn->dt.hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ccn->dt.hrtimer.function = arm_ccn_pmu_timer_handler; + hrtimer_setup(&ccn->dt.hrtimer, arm_ccn_pmu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } /* Pick one CPU which we will use to collect data from CCN... */ diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c index 039feded91527..72ac17efd846a 100644 --- a/drivers/perf/marvell_cn10k_ddr_pmu.c +++ b/drivers/perf/marvell_cn10k_ddr_pmu.c @@ -1064,8 +1064,8 @@ static int cn10k_ddr_perf_probe(struct platform_device *pdev) if (!name) return -ENOMEM; - hrtimer_init(&ddr_pmu->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ddr_pmu->hrtimer.function = cn10k_ddr_pmu_timer_handler; + hrtimer_setup(&ddr_pmu->hrtimer, cn10k_ddr_pmu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cpuhp_state_add_instance_nocalls( CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE, diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index cadd60221b8f7..6ed4707bd6bb5 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -752,9 +752,8 @@ static int tx2_uncore_pmu_add_dev(struct tx2_uncore_pmu *tx2_pmu) tx2_pmu->cpu = cpu; if (tx2_pmu->hrtimer_callback) { - hrtimer_init(&tx2_pmu->hrtimer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - tx2_pmu->hrtimer.function = tx2_pmu->hrtimer_callback; + hrtimer_setup(&tx2_pmu->hrtimer, tx2_pmu->hrtimer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ret = tx2_uncore_pmu_register(tx2_pmu); diff --git a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c index 45004f598e4dc..e4c0a82d16d9e 100644 --- a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c +++ b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c @@ -325,7 +325,7 @@ to_fsl_samsung_hdmi_phy(struct clk_hw *hw) return container_of(hw, struct fsl_samsung_hdmi_phy, hw); } -static void +static int fsl_samsung_hdmi_phy_configure_pll_lock_det(struct fsl_samsung_hdmi_phy *phy, const struct phy_config *cfg) { @@ -341,6 +341,9 @@ fsl_samsung_hdmi_phy_configure_pll_lock_det(struct fsl_samsung_hdmi_phy *phy, break; } + if (unlikely(div == 4)) + return -EINVAL; + writeb(FIELD_PREP(REG12_CK_DIV_MASK, div), phy->regs + PHY_REG(12)); /* @@ -364,6 +367,8 @@ fsl_samsung_hdmi_phy_configure_pll_lock_det(struct fsl_samsung_hdmi_phy *phy, FIELD_PREP(REG14_RP_CODE_MASK, 2) | FIELD_PREP(REG14_TG_CODE_HIGH_MASK, fld_tg_code >> 8), phy->regs + PHY_REG(14)); + + return 0; } static unsigned long fsl_samsung_hdmi_phy_find_pms(unsigned long fout, u8 *p, u16 *m, u8 *s) @@ -466,7 +471,11 @@ static int fsl_samsung_hdmi_phy_configure(struct fsl_samsung_hdmi_phy *phy, writeb(REG21_SEL_TX_CK_INV | FIELD_PREP(REG21_PMS_S_MASK, cfg->pll_div_regs[2] >> 4), phy->regs + PHY_REG(21)); - fsl_samsung_hdmi_phy_configure_pll_lock_det(phy, cfg); + ret = fsl_samsung_hdmi_phy_configure_pll_lock_det(phy, cfg); + if (ret) { + dev_err(phy->dev, "pixclock too large\n"); + return ret; + } writeb(REG33_FIX_DA | REG33_MODE_SET_DONE, phy->regs + PHY_REG(33)); diff --git a/drivers/phy/rockchip/Kconfig b/drivers/phy/rockchip/Kconfig index 2f7a05f21dc59..dcb8e1628632e 100644 --- a/drivers/phy/rockchip/Kconfig +++ b/drivers/phy/rockchip/Kconfig @@ -125,6 +125,7 @@ config PHY_ROCKCHIP_USBDP depends on ARCH_ROCKCHIP && OF depends on TYPEC select GENERIC_PHY + select USB_COMMON help Enable this to support the Rockchip USB3.0/DP combo PHY with Samsung IP block. This is required for USB3 support on RK3588. diff --git a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c index a1532ef8bbe9d..8c3ce57f89151 100644 --- a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c +++ b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c @@ -324,7 +324,10 @@ static int rockchip_combphy_parse_dt(struct device *dev, struct rockchip_combphy priv->ext_refclk = device_property_present(dev, "rockchip,ext-refclk"); - priv->phy_rst = devm_reset_control_get(dev, "phy"); + priv->phy_rst = devm_reset_control_get_exclusive(dev, "phy"); + /* fallback to old behaviour */ + if (PTR_ERR(priv->phy_rst) == -ENOENT) + priv->phy_rst = devm_reset_control_array_get_exclusive(dev); if (IS_ERR(priv->phy_rst)) return dev_err_probe(dev, PTR_ERR(priv->phy_rst), "failed to get phy reset\n"); diff --git a/drivers/phy/samsung/phy-exynos5-usbdrd.c b/drivers/phy/samsung/phy-exynos5-usbdrd.c index c421b495eb0fe..46b8f6987c62c 100644 --- a/drivers/phy/samsung/phy-exynos5-usbdrd.c +++ b/drivers/phy/samsung/phy-exynos5-usbdrd.c @@ -488,9 +488,9 @@ exynos5_usbdrd_pipe3_set_refclk(struct phy_usb_instance *inst) reg |= PHYCLKRST_REFCLKSEL_EXT_REFCLK; /* FSEL settings corresponding to reference clock */ - reg &= ~PHYCLKRST_FSEL_PIPE_MASK | - PHYCLKRST_MPLL_MULTIPLIER_MASK | - PHYCLKRST_SSC_REFCLKSEL_MASK; + reg &= ~(PHYCLKRST_FSEL_PIPE_MASK | + PHYCLKRST_MPLL_MULTIPLIER_MASK | + PHYCLKRST_SSC_REFCLKSEL_MASK); switch (phy_drd->extrefclk) { case EXYNOS5_FSEL_50MHZ: reg |= (PHYCLKRST_MPLL_MULTIPLIER_50M_REF | @@ -532,9 +532,9 @@ exynos5_usbdrd_utmi_set_refclk(struct phy_usb_instance *inst) reg &= ~PHYCLKRST_REFCLKSEL_MASK; reg |= PHYCLKRST_REFCLKSEL_EXT_REFCLK; - reg &= ~PHYCLKRST_FSEL_UTMI_MASK | - PHYCLKRST_MPLL_MULTIPLIER_MASK | - PHYCLKRST_SSC_REFCLKSEL_MASK; + reg &= ~(PHYCLKRST_FSEL_UTMI_MASK | + PHYCLKRST_MPLL_MULTIPLIER_MASK | + PHYCLKRST_SSC_REFCLKSEL_MASK); reg |= PHYCLKRST_FSEL(phy_drd->extrefclk); return reg; @@ -1296,14 +1296,17 @@ static int exynos5_usbdrd_gs101_phy_exit(struct phy *phy) struct exynos5_usbdrd_phy *phy_drd = to_usbdrd_phy(inst); int ret; + if (inst->phy_cfg->id == EXYNOS5_DRDPHY_UTMI) { + ret = exynos850_usbdrd_phy_exit(phy); + if (ret) + return ret; + } + + exynos5_usbdrd_phy_isol(inst, true); + if (inst->phy_cfg->id != EXYNOS5_DRDPHY_UTMI) return 0; - ret = exynos850_usbdrd_phy_exit(phy); - if (ret) - return ret; - - exynos5_usbdrd_phy_isol(inst, true); return regulator_bulk_disable(phy_drd->drv_data->n_regulators, phy_drd->regulators); } diff --git a/drivers/phy/st/phy-stm32-combophy.c b/drivers/phy/st/phy-stm32-combophy.c index 49e9fa90a6819..607b4d607eb5e 100644 --- a/drivers/phy/st/phy-stm32-combophy.c +++ b/drivers/phy/st/phy-stm32-combophy.c @@ -111,6 +111,7 @@ static const struct clk_impedance imp_lookup[] = { { 4204000, { 511000, 609000, 706000, 802000 } }, { 3999000, { 571000, 648000, 726000, 803000 } } }; +#define DEFAULT_IMP_INDEX 3 /* Default impedance is 50 Ohm */ static int stm32_impedance_tune(struct stm32_combophy *combophy) { @@ -119,10 +120,9 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) u8 imp_of, vswing_of; u32 max_imp = imp_lookup[0].microohm; u32 min_imp = imp_lookup[imp_size - 1].microohm; - u32 max_vswing = imp_lookup[imp_size - 1].vswing[vswing_size - 1]; + u32 max_vswing; u32 min_vswing = imp_lookup[0].vswing[0]; u32 val; - u32 regval; if (!of_property_read_u32(combophy->dev->of_node, "st,output-micro-ohms", &val)) { if (val < min_imp || val > max_imp) { @@ -130,45 +130,43 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - regval = 0; - for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) { - if (imp_lookup[imp_of].microohm <= val) { - regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of); + for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) + if (imp_lookup[imp_of].microohm <= val) break; - } - } + + if (WARN_ON(imp_of == ARRAY_SIZE(imp_lookup))) + return -EINVAL; dev_dbg(combophy->dev, "Set %u micro-ohms output impedance\n", imp_lookup[imp_of].microohm); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_OHM, - regval); - } else { - regmap_read(combophy->regmap, SYSCFG_PCIEPRGCR, &val); - imp_of = FIELD_GET(STM32MP25_PCIEPRG_IMPCTRL_OHM, val); - } + FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of)); + } else + imp_of = DEFAULT_IMP_INDEX; if (!of_property_read_u32(combophy->dev->of_node, "st,output-vswing-microvolt", &val)) { + max_vswing = imp_lookup[imp_of].vswing[vswing_size - 1]; + if (val < min_vswing || val > max_vswing) { dev_err(combophy->dev, "Invalid value %u for output vswing\n", val); return -EINVAL; } - regval = 0; - for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) { - if (imp_lookup[imp_of].vswing[vswing_of] >= val) { - regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of); + for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) + if (imp_lookup[imp_of].vswing[vswing_of] >= val) break; - } - } + + if (WARN_ON(vswing_of == ARRAY_SIZE(imp_lookup[imp_of].vswing))) + return -EINVAL; dev_dbg(combophy->dev, "Set %u microvolt swing\n", imp_lookup[imp_of].vswing[vswing_of]); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_VSWING, - regval); + FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); } return 0; diff --git a/drivers/phy/tegra/xusb-tegra186.c b/drivers/phy/tegra/xusb-tegra186.c index 0f60d5d1c1678..fae6242aa730e 100644 --- a/drivers/phy/tegra/xusb-tegra186.c +++ b/drivers/phy/tegra/xusb-tegra186.c @@ -928,6 +928,7 @@ static int tegra186_utmi_phy_init(struct phy *phy) unsigned int index = lane->index; struct device *dev = padctl->dev; int err; + u32 reg; port = tegra_xusb_find_usb2_port(padctl, index); if (!port) { @@ -935,6 +936,16 @@ static int tegra186_utmi_phy_init(struct phy *phy) return -ENODEV; } + if (port->mode == USB_DR_MODE_OTG || + port->mode == USB_DR_MODE_PERIPHERAL) { + /* reset VBUS&ID OVERRIDE */ + reg = padctl_readl(padctl, USB2_VBUS_ID); + reg &= ~VBUS_OVERRIDE; + reg &= ~ID_OVERRIDE(~0); + reg |= ID_OVERRIDE_FLOATING; + padctl_writel(padctl, reg, USB2_VBUS_ID); + } + if (port->supply && port->mode == USB_DR_MODE_HOST) { err = regulator_enable(port->supply); if (err) { diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c index e0ca59ae31531..ff5d5e29629fa 100644 --- a/drivers/phy/ti/phy-gmii-sel.c +++ b/drivers/phy/ti/phy-gmii-sel.c @@ -424,6 +424,12 @@ static int phy_gmii_sel_init_ports(struct phy_gmii_sel_priv *priv) return 0; } +static const struct regmap_config phy_gmii_sel_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, +}; + static int phy_gmii_sel_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -468,7 +474,14 @@ static int phy_gmii_sel_probe(struct platform_device *pdev) priv->regmap = syscon_node_to_regmap(node->parent); if (IS_ERR(priv->regmap)) { - priv->regmap = device_node_to_regmap(node); + void __iomem *base; + + base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(base)) + return dev_err_probe(dev, PTR_ERR(base), + "failed to get base memory resource\n"); + + priv->regmap = regmap_init_mmio(dev, base, &phy_gmii_sel_regmap_cfg); if (IS_ERR(priv->regmap)) return dev_err_probe(dev, PTR_ERR(priv->regmap), "Failed to get syscon\n"); diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index 49c383eb67854..13e37b49d9d01 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -6,6 +6,7 @@ menuconfig CZNIC_PLATFORMS bool "Platform support for CZ.NIC's Turris hardware" + depends on ARCH_MVEBU || COMPILE_TEST help Say Y here to be able to choose driver support for CZ.NIC's Turris devices. This option alone does not add any kernel code. diff --git a/drivers/platform/x86/amd/hsmp/Kconfig b/drivers/platform/x86/amd/hsmp/Kconfig index 7d10d4462a453..d6f7a62d55b5e 100644 --- a/drivers/platform/x86/amd/hsmp/Kconfig +++ b/drivers/platform/x86/amd/hsmp/Kconfig @@ -7,7 +7,7 @@ config AMD_HSMP tristate menu "AMD HSMP Driver" - depends on AMD_NB || COMPILE_TEST + depends on AMD_NODE || COMPILE_TEST config AMD_HSMP_ACPI tristate "AMD HSMP ACPI device driver" diff --git a/drivers/platform/x86/amd/hsmp/acpi.c b/drivers/platform/x86/amd/hsmp/acpi.c index 444b43be35a25..c1eccb3c80c5c 100644 --- a/drivers/platform/x86/amd/hsmp/acpi.c +++ b/drivers/platform/x86/amd/hsmp/acpi.c @@ -10,7 +10,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include @@ -24,6 +23,8 @@ #include +#include + #include "hsmp.h" #define DRIVER_NAME "amd_hsmp" @@ -321,8 +322,8 @@ static int hsmp_acpi_probe(struct platform_device *pdev) return -ENOMEM; if (!hsmp_pdev->is_probed) { - hsmp_pdev->num_sockets = amd_nb_num(); - if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_SOCKETS) + hsmp_pdev->num_sockets = amd_num_nodes(); + if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_NUM_NODES) return -ENODEV; hsmp_pdev->sock = devm_kcalloc(&pdev->dev, hsmp_pdev->num_sockets, diff --git a/drivers/platform/x86/amd/hsmp/hsmp.c b/drivers/platform/x86/amd/hsmp/hsmp.c index 03164e30b3a50..a3ac09a90de45 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.c +++ b/drivers/platform/x86/amd/hsmp/hsmp.c @@ -8,7 +8,6 @@ */ #include -#include #include #include diff --git a/drivers/platform/x86/amd/hsmp/hsmp.h b/drivers/platform/x86/amd/hsmp/hsmp.h index e852f0a947e4f..af8b21f821d66 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.h +++ b/drivers/platform/x86/amd/hsmp/hsmp.h @@ -21,8 +21,6 @@ #define HSMP_ATTR_GRP_NAME_SIZE 10 -#define MAX_AMD_SOCKETS 8 - #define HSMP_CDEV_NAME "hsmp_cdev" #define HSMP_DEVNODE_NAME "hsmp" @@ -41,7 +39,6 @@ struct hsmp_socket { void __iomem *virt_base_addr; struct semaphore hsmp_sem; char name[HSMP_ATTR_GRP_NAME_SIZE]; - struct pci_dev *root; struct device *dev; u16 sock_ind; int (*amd_hsmp_rdwr)(struct hsmp_socket *sock, u32 off, u32 *val, bool rw); diff --git a/drivers/platform/x86/amd/hsmp/plat.c b/drivers/platform/x86/amd/hsmp/plat.c index 02ca85762b686..b9782a078dbd2 100644 --- a/drivers/platform/x86/amd/hsmp/plat.c +++ b/drivers/platform/x86/amd/hsmp/plat.c @@ -10,14 +10,16 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include #include #include +#include + #include "hsmp.h" #define DRIVER_NAME "amd_hsmp" @@ -34,28 +36,12 @@ #define SMN_HSMP_MSG_RESP 0x0010980 #define SMN_HSMP_MSG_DATA 0x00109E0 -#define HSMP_INDEX_REG 0xc4 -#define HSMP_DATA_REG 0xc8 - static struct hsmp_plat_device *hsmp_pdev; static int amd_hsmp_pci_rdwr(struct hsmp_socket *sock, u32 offset, u32 *value, bool write) { - int ret; - - if (!sock->root) - return -ENODEV; - - ret = pci_write_config_dword(sock->root, HSMP_INDEX_REG, - sock->mbinfo.base_addr + offset); - if (ret) - return ret; - - ret = (write ? pci_write_config_dword(sock->root, HSMP_DATA_REG, *value) - : pci_read_config_dword(sock->root, HSMP_DATA_REG, value)); - - return ret; + return amd_smn_hsmp_rdwr(sock->sock_ind, sock->mbinfo.base_addr + offset, value, write); } static ssize_t hsmp_metric_tbl_plat_read(struct file *filp, struct kobject *kobj, @@ -95,7 +81,12 @@ static umode_t hsmp_is_sock_attr_visible(struct kobject *kobj, * Static array of 8 + 1(for NULL) elements is created below * to create sysfs groups for sockets. * is_bin_visible function is used to show / hide the necessary groups. + * + * Validate the maximum number against MAX_AMD_NUM_NODES. If this changes, + * then the attributes and groups below must be adjusted. */ +static_assert(MAX_AMD_NUM_NODES == 8); + #define HSMP_BIN_ATTR(index, _list) \ static const struct bin_attribute attr##index = { \ .attr = { .name = HSMP_METRICS_TABLE_NAME, .mode = 0444}, \ @@ -159,10 +150,7 @@ static int init_platform_device(struct device *dev) int ret, i; for (i = 0; i < hsmp_pdev->num_sockets; i++) { - if (!node_to_amd_nb(i)) - return -ENODEV; sock = &hsmp_pdev->sock[i]; - sock->root = node_to_amd_nb(i)->root; sock->sock_ind = i; sock->dev = dev; sock->mbinfo.base_addr = SMN_HSMP_BASE; @@ -305,11 +293,11 @@ static int __init hsmp_plt_init(void) return -ENOMEM; /* - * amd_nb_num() returns number of SMN/DF interfaces present in the system + * amd_num_nodes() returns number of SMN/DF interfaces present in the system * if we have N SMN/DF interfaces that ideally means N sockets */ - hsmp_pdev->num_sockets = amd_nb_num(); - if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_SOCKETS) + hsmp_pdev->num_sockets = amd_num_nodes(); + if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_NUM_NODES) return ret; ret = platform_driver_register(&amd_hsmp_driver); diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 764cc1fe90ae4..a2cb2d5544f5b 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -452,6 +452,7 @@ static int amd_pmf_probe(struct platform_device *pdev) mutex_init(&dev->lock); mutex_init(&dev->update_mutex); + mutex_init(&dev->cb_mutex); apmf_acpi_init(dev); platform_set_drvdata(pdev, dev); @@ -477,6 +478,7 @@ static void amd_pmf_remove(struct platform_device *pdev) amd_pmf_dbgfs_unregister(dev); mutex_destroy(&dev->lock); mutex_destroy(&dev->update_mutex); + mutex_destroy(&dev->cb_mutex); kfree(dev->buf); } diff --git a/drivers/platform/x86/amd/pmf/pmf.h b/drivers/platform/x86/amd/pmf/pmf.h index 41b2b91b8fdc6..e6bdee68ccf34 100644 --- a/drivers/platform/x86/amd/pmf/pmf.h +++ b/drivers/platform/x86/amd/pmf/pmf.h @@ -106,9 +106,12 @@ struct cookie_header { #define PMF_TA_IF_VERSION_MAJOR 1 #define TA_PMF_ACTION_MAX 32 #define TA_PMF_UNDO_MAX 8 -#define TA_OUTPUT_RESERVED_MEM 906 +#define TA_OUTPUT_RESERVED_MEM 922 #define MAX_OPERATION_PARAMS 4 +#define TA_ERROR_CRYPTO_INVALID_PARAM 0x20002 +#define TA_ERROR_CRYPTO_BIN_TOO_LARGE 0x2000d + #define PMF_IF_V1 1 #define PMF_IF_V2 2 diff --git a/drivers/platform/x86/amd/pmf/sps.c b/drivers/platform/x86/amd/pmf/sps.c index e6cf0b22dac33..d3083383f11fb 100644 --- a/drivers/platform/x86/amd/pmf/sps.c +++ b/drivers/platform/x86/amd/pmf/sps.c @@ -297,12 +297,14 @@ int amd_pmf_get_pprof_modes(struct amd_pmf_dev *pmf) switch (pmf->current_profile) { case PLATFORM_PROFILE_PERFORMANCE: + case PLATFORM_PROFILE_BALANCED_PERFORMANCE: mode = POWER_MODE_PERFORMANCE; break; case PLATFORM_PROFILE_BALANCED: mode = POWER_MODE_BALANCED_POWER; break; case PLATFORM_PROFILE_LOW_POWER: + case PLATFORM_PROFILE_QUIET: mode = POWER_MODE_POWER_SAVER; break; default: @@ -387,6 +389,14 @@ static int amd_pmf_profile_set(struct device *dev, return 0; } +static int amd_pmf_hidden_choices(void *drvdata, unsigned long *choices) +{ + set_bit(PLATFORM_PROFILE_QUIET, choices); + set_bit(PLATFORM_PROFILE_BALANCED_PERFORMANCE, choices); + + return 0; +} + static int amd_pmf_profile_probe(void *drvdata, unsigned long *choices) { set_bit(PLATFORM_PROFILE_LOW_POWER, choices); @@ -398,6 +408,7 @@ static int amd_pmf_profile_probe(void *drvdata, unsigned long *choices) static const struct platform_profile_ops amd_pmf_profile_ops = { .probe = amd_pmf_profile_probe, + .hidden_choices = amd_pmf_hidden_choices, .profile_get = amd_pmf_profile_get, .profile_set = amd_pmf_profile_set, }; diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 8c88769ea1d87..ceaff1ebb7b93 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -27,8 +27,11 @@ module_param(pb_side_load, bool, 0444); MODULE_PARM_DESC(pb_side_load, "Sideload policy binaries debug policy failures"); #endif -static const uuid_t amd_pmf_ta_uuid = UUID_INIT(0x6fd93b77, 0x3fb8, 0x524d, - 0xb1, 0x2d, 0xc5, 0x29, 0xb1, 0x3d, 0x85, 0x43); +static const uuid_t amd_pmf_ta_uuid[] = { UUID_INIT(0xd9b39bf2, 0x66bd, 0x4154, 0xaf, 0xb8, 0x8a, + 0xcc, 0x2b, 0x2b, 0x60, 0xd6), + UUID_INIT(0x6fd93b77, 0x3fb8, 0x524d, 0xb1, 0x2d, 0xc5, + 0x29, 0xb1, 0x3d, 0x85, 0x43), + }; static const char *amd_pmf_uevent_as_str(unsigned int state) { @@ -321,9 +324,9 @@ static int amd_pmf_start_policy_engine(struct amd_pmf_dev *dev) */ schedule_delayed_work(&dev->pb_work, msecs_to_jiffies(pb_actions_ms * 3)); } else { - dev_err(dev->dev, "ta invoke cmd init failed err: %x\n", res); + dev_dbg(dev->dev, "ta invoke cmd init failed err: %x\n", res); dev->smart_pc_enabled = false; - return -EIO; + return res; } return 0; @@ -390,12 +393,12 @@ static int amd_pmf_amdtee_ta_match(struct tee_ioctl_version_data *ver, const voi return ver->impl_id == TEE_IMPL_ID_AMDTEE; } -static int amd_pmf_ta_open_session(struct tee_context *ctx, u32 *id) +static int amd_pmf_ta_open_session(struct tee_context *ctx, u32 *id, const uuid_t *uuid) { struct tee_ioctl_open_session_arg sess_arg = {}; int rc; - export_uuid(sess_arg.uuid, &amd_pmf_ta_uuid); + export_uuid(sess_arg.uuid, uuid); sess_arg.clnt_login = TEE_IOCTL_LOGIN_PUBLIC; sess_arg.num_params = 0; @@ -434,7 +437,7 @@ static int amd_pmf_register_input_device(struct amd_pmf_dev *dev) return 0; } -static int amd_pmf_tee_init(struct amd_pmf_dev *dev) +static int amd_pmf_tee_init(struct amd_pmf_dev *dev, const uuid_t *uuid) { u32 size; int ret; @@ -445,7 +448,7 @@ static int amd_pmf_tee_init(struct amd_pmf_dev *dev) return PTR_ERR(dev->tee_ctx); } - ret = amd_pmf_ta_open_session(dev->tee_ctx, &dev->session_id); + ret = amd_pmf_ta_open_session(dev->tee_ctx, &dev->session_id, uuid); if (ret) { dev_err(dev->dev, "Failed to open TA session (%d)\n", ret); ret = -EINVAL; @@ -489,7 +492,8 @@ static void amd_pmf_tee_deinit(struct amd_pmf_dev *dev) int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) { - int ret; + bool status; + int ret, i; ret = apmf_check_smart_pc(dev); if (ret) { @@ -502,10 +506,6 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) return -ENODEV; } - ret = amd_pmf_tee_init(dev); - if (ret) - return ret; - INIT_DELAYED_WORK(&dev->pb_work, amd_pmf_invoke_cmd); ret = amd_pmf_set_dram_addr(dev, true); @@ -534,8 +534,30 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) goto error; } - ret = amd_pmf_start_policy_engine(dev); - if (ret) + for (i = 0; i < ARRAY_SIZE(amd_pmf_ta_uuid); i++) { + ret = amd_pmf_tee_init(dev, &amd_pmf_ta_uuid[i]); + if (ret) + return ret; + + ret = amd_pmf_start_policy_engine(dev); + switch (ret) { + case TA_PMF_TYPE_SUCCESS: + status = true; + break; + case TA_ERROR_CRYPTO_INVALID_PARAM: + case TA_ERROR_CRYPTO_BIN_TOO_LARGE: + amd_pmf_tee_deinit(dev); + status = false; + break; + default: + goto error; + } + + if (status) + break; + } + + if (!status && !pb_side_load) goto error; if (pb_side_load) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 927a2993f6160..88a1a9ff2f344 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -139,6 +139,13 @@ static const struct dmi_system_id button_array_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Surface Go 3"), }, }, + { + .ident = "Microsoft Surface Go 4", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Surface Go 4"), + }, + }, { } }; diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 8272f1dd0fbc0..db3c031d17572 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -404,6 +404,11 @@ static const struct intel_vsec_platform_info oobmsm_info = { .caps = VSEC_CAP_TELEMETRY | VSEC_CAP_SDSI | VSEC_CAP_TPMI, }; +/* DMR OOBMSM info */ +static const struct intel_vsec_platform_info dmr_oobmsm_info = { + .caps = VSEC_CAP_TELEMETRY | VSEC_CAP_TPMI, +}; + /* TGL info */ static const struct intel_vsec_platform_info tgl_info = { .caps = VSEC_CAP_TELEMETRY, @@ -420,6 +425,7 @@ static const struct intel_vsec_platform_info lnl_info = { #define PCI_DEVICE_ID_INTEL_VSEC_MTL_M 0x7d0d #define PCI_DEVICE_ID_INTEL_VSEC_MTL_S 0xad0d #define PCI_DEVICE_ID_INTEL_VSEC_OOBMSM 0x09a7 +#define PCI_DEVICE_ID_INTEL_VSEC_OOBMSM_DMR 0x09a1 #define PCI_DEVICE_ID_INTEL_VSEC_RPL 0xa77d #define PCI_DEVICE_ID_INTEL_VSEC_TGL 0x9a0d #define PCI_DEVICE_ID_INTEL_VSEC_LNL_M 0x647d @@ -430,6 +436,7 @@ static const struct pci_device_id intel_vsec_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, VSEC_MTL_M, &mtl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_MTL_S, &mtl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_OOBMSM, &oobmsm_info) }, + { PCI_DEVICE_DATA(INTEL, VSEC_OOBMSM_DMR, &dmr_oobmsm_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_RPL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_TGL, &tgl_info) }, { PCI_DEVICE_DATA(INTEL, VSEC_LNL_M, &lnl_info) }, diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 72a10ed2017ce..1cc91173e0127 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -9972,6 +9972,7 @@ static const struct tpacpi_quirk battery_quirk_table[] __initconst = { * Individual addressing is broken on models that expose the * primary battery as BAT1. */ + TPACPI_Q_LNV('G', '8', true), /* ThinkPad X131e */ TPACPI_Q_LNV('8', 'F', true), /* Thinkpad X120e */ TPACPI_Q_LNV('J', '7', true), /* B5400 */ TPACPI_Q_LNV('J', 'I', true), /* Thinkpad 11e */ diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index 1a6fc8d38e206..90c664d344d0f 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -162,11 +162,11 @@ static void ltc2952_poweroff_default(struct ltc2952_poweroff *data) data->wde_interval = 300L * NSEC_PER_MSEC; data->trigger_delay = ktime_set(2, 500L * NSEC_PER_MSEC); - hrtimer_init(&data->timer_trigger, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->timer_trigger.function = ltc2952_poweroff_timer_trigger; + hrtimer_setup(&data->timer_trigger, ltc2952_poweroff_timer_trigger, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&data->timer_wde, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->timer_wde.function = ltc2952_poweroff_timer_wde; + hrtimer_setup(&data->timer_wde, ltc2952_poweroff_timer_wde, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } static int ltc2952_poweroff_init(struct platform_device *pdev) diff --git a/drivers/power/supply/ab8500_chargalg.c b/drivers/power/supply/ab8500_chargalg.c index 7a8d1feb8e905..dc6c8b0dd1cfd 100644 --- a/drivers/power/supply/ab8500_chargalg.c +++ b/drivers/power/supply/ab8500_chargalg.c @@ -1787,13 +1787,12 @@ static int ab8500_chargalg_probe(struct platform_device *pdev) psy_cfg.drv_data = di; /* Initilialize safety timer */ - hrtimer_init(&di->safety_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - di->safety_timer.function = ab8500_chargalg_safety_timer_expired; + hrtimer_setup(&di->safety_timer, ab8500_chargalg_safety_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* Initilialize maintenance timer */ - hrtimer_init(&di->maintenance_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - di->maintenance_timer.function = - ab8500_chargalg_maintenance_timer_expired; + hrtimer_setup(&di->maintenance_timer, ab8500_chargalg_maintenance_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); /* Init work for chargalg */ INIT_DEFERRABLE_WORK(&di->chargalg_periodic_work, diff --git a/drivers/power/supply/axp20x_battery.c b/drivers/power/supply/axp20x_battery.c index fa27195f074e7..3c3158f31a484 100644 --- a/drivers/power/supply/axp20x_battery.c +++ b/drivers/power/supply/axp20x_battery.c @@ -466,10 +466,9 @@ static int axp717_battery_get_prop(struct power_supply *psy, /* * If a fault is detected it must also be cleared; if the - * condition persists it should reappear (This is an - * assumption, it's actually not documented). A restart was - * not sufficient to clear the bit in testing despite the - * register listed as POR. + * condition persists it should reappear. A restart was not + * sufficient to clear the bit in testing despite the register + * listed as POR. */ case POWER_SUPPLY_PROP_HEALTH: ret = regmap_read(axp20x_batt->regmap, AXP717_PMU_FAULT, @@ -480,26 +479,26 @@ static int axp717_battery_get_prop(struct power_supply *psy, switch (reg & AXP717_BATT_PMU_FAULT_MASK) { case AXP717_BATT_UVLO_2_5V: val->intval = POWER_SUPPLY_HEALTH_DEAD; - regmap_update_bits(axp20x_batt->regmap, - AXP717_PMU_FAULT, - AXP717_BATT_UVLO_2_5V, - AXP717_BATT_UVLO_2_5V); + regmap_write_bits(axp20x_batt->regmap, + AXP717_PMU_FAULT, + AXP717_BATT_UVLO_2_5V, + AXP717_BATT_UVLO_2_5V); return 0; case AXP717_BATT_OVER_TEMP: val->intval = POWER_SUPPLY_HEALTH_HOT; - regmap_update_bits(axp20x_batt->regmap, - AXP717_PMU_FAULT, - AXP717_BATT_OVER_TEMP, - AXP717_BATT_OVER_TEMP); + regmap_write_bits(axp20x_batt->regmap, + AXP717_PMU_FAULT, + AXP717_BATT_OVER_TEMP, + AXP717_BATT_OVER_TEMP); return 0; case AXP717_BATT_UNDER_TEMP: val->intval = POWER_SUPPLY_HEALTH_COLD; - regmap_update_bits(axp20x_batt->regmap, - AXP717_PMU_FAULT, - AXP717_BATT_UNDER_TEMP, - AXP717_BATT_UNDER_TEMP); + regmap_write_bits(axp20x_batt->regmap, + AXP717_PMU_FAULT, + AXP717_BATT_UNDER_TEMP, + AXP717_BATT_UNDER_TEMP); return 0; default: diff --git a/drivers/power/supply/da9150-fg.c b/drivers/power/supply/da9150-fg.c index 652c1f213af1c..4f28ef1bba1a3 100644 --- a/drivers/power/supply/da9150-fg.c +++ b/drivers/power/supply/da9150-fg.c @@ -247,9 +247,9 @@ static int da9150_fg_current_avg(struct da9150_fg *fg, DA9150_QIF_SD_GAIN_SIZE); da9150_fg_read_sync_end(fg); - div = (u64) (sd_gain * shunt_val * 65536ULL); + div = 65536ULL * sd_gain * shunt_val; do_div(div, 1000000); - res = (u64) (iavg * 1000000ULL); + res = 1000000ULL * iavg; do_div(res, div); val->intval = (int) res; diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index d0bb52a7a0367..76c340b38015a 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -1592,11 +1592,11 @@ __power_supply_register(struct device *parent, if (rc) goto register_thermal_failed; - scoped_guard(rwsem_read, &psy->extensions_sem) { - rc = power_supply_create_triggers(psy); - if (rc) - goto create_triggers_failed; + rc = power_supply_create_triggers(psy); + if (rc) + goto create_triggers_failed; + scoped_guard(rwsem_read, &psy->extensions_sem) { rc = power_supply_add_hwmon_sysfs(psy); if (rc) goto add_hwmon_sysfs_failed; diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index 04c212953ded6..5ad7cc438068e 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -339,8 +339,7 @@ struct idle_inject_device *idle_inject_register_full(struct cpumask *cpumask, return NULL; cpumask_copy(to_cpumask(ii_dev->cpumask), cpumask); - hrtimer_init(&ii_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ii_dev->timer.function = idle_inject_timer_fn; + hrtimer_setup(&ii_dev->timer, idle_inject_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ii_dev->latency_us = UINT_MAX; ii_dev->update = update; diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 77d75e1f14a9f..5ab3feb296868 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1274,7 +1274,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &rapl_defaults_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2,&rapl_defaults_ann), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), @@ -2064,8 +2064,7 @@ int rapl_package_add_pmu(struct rapl_package *rp) raw_spin_lock_init(&data->lock); INIT_LIST_HEAD(&data->active_list); data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms); - hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->hrtimer.function = rapl_hrtimer_handle; + hrtimer_setup(&data->hrtimer, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return rapl_pmu_update(rp); } diff --git a/drivers/pps/generators/pps_gen_parport.c b/drivers/pps/generators/pps_gen_parport.c index d46eed1594951..f5eeb4dd01ad0 100644 --- a/drivers/pps/generators/pps_gen_parport.c +++ b/drivers/pps/generators/pps_gen_parport.c @@ -208,8 +208,7 @@ static void parport_attach(struct parport *port) calibrate_port(&device); - hrtimer_init(&device.timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - device.timer.function = hrtimer_event; + hrtimer_setup(&device.timer, hrtimer_event, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_start(&device.timer, next_intr_time(&device), HRTIMER_MODE_ABS); return; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 27afbb9d544b7..cbf531d0ba688 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -1742,7 +1742,8 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv, err = rio_add_net(net); if (err) { rmcd_debug(RDEV, "failed to register net, err=%d", err); - kfree(net); + put_device(&net->dev); + mport->net = NULL; goto cleanup; } } diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index fdcf742b2adbc..c12941f71e2cb 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -871,7 +871,10 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport, dev_set_name(&net->dev, "rnet_%d", net->id); net->dev.parent = &mport->dev; net->dev.release = rio_scan_release_dev; - rio_add_net(net); + if (rio_add_net(net)) { + put_device(&net->dev); + net = NULL; + } } return net; diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index e31fa0ad127e9..b88cd4fb295bc 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -240,8 +240,7 @@ static struct rtc_device *rtc_allocate_device(void) /* Init uie timer */ rtc_timer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, rtc); /* Init pie timer */ - hrtimer_init(&rtc->pie_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rtc->pie_timer.function = rtc_pie_update_irq; + hrtimer_setup(&rtc->pie_timer, rtc_pie_update_irq, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rtc->pie_enabled = 0; set_bit(RTC_FEATURE_ALARM, rtc->features); diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 26e1ea1940ecb..62feb2c639d56 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -2326,8 +2326,7 @@ static inline int __init ap_async_init(void) */ if (MACHINE_IS_VM) poll_high_timeout = 1500000; - hrtimer_init(&ap_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ap_poll_timer.function = ap_poll_timeout; + hrtimer_setup(&ap_poll_timer, ap_poll_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); queue_work(system_long_wq, &ap_scan_bus_work); diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index 16d085d56e9d7..9e42230e42b86 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -2922,9 +2922,7 @@ static long ibmvscsis_alloctimer(struct scsi_info *vscsi) struct timer_cb *p_timer; p_timer = &vscsi->rsp_q_timer; - hrtimer_init(&p_timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - - p_timer->timer.function = ibmvscsis_service_wait_q; + hrtimer_setup(&p_timer->timer, ibmvscsis_service_wait_q, CLOCK_MONOTONIC, HRTIMER_MODE_REL); p_timer->started = false; p_timer->timer_pops = 0; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index bcadf11414c8a..d1ac1d1cec3c6 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -7952,11 +7952,10 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) timer_setup(&phba->fcf.redisc_wait, lpfc_sli4_fcf_redisc_wait_tmo, 0); /* CMF congestion timer */ - hrtimer_init(&phba->cmf_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - phba->cmf_timer.function = lpfc_cmf_timer; + hrtimer_setup(&phba->cmf_timer, lpfc_cmf_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); /* CMF 1 minute stats collection timer */ - hrtimer_init(&phba->cmf_stats_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - phba->cmf_stats_timer.function = lpfc_cmf_stats_timer; + hrtimer_setup(&phba->cmf_stats_timer, lpfc_cmf_stats_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Control structure for handling external multi-buffer mailbox diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 5ceaa4665e5df..fe5c30bb26398 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -6384,8 +6384,8 @@ static struct sdebug_queued_cmd *sdebug_alloc_queued_cmd(struct scsi_cmnd *scmd) sd_dp = &sqcp->sd_dp; - hrtimer_init(&sd_dp->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - sd_dp->hrt.function = sdebug_q_cmd_hrt_complete; + hrtimer_setup(&sd_dp->hrt, sdebug_q_cmd_hrt_complete, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); INIT_WORK(&sd_dp->ew.work, sdebug_q_cmd_wq_complete); sqcp->scmd = scmd; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index be0890e4e7062..f1cfe0bb89b20 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1669,13 +1669,6 @@ static blk_status_t scsi_prepare_cmd(struct request *req) if (in_flight) __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); - /* - * Only clear the driver-private command data if the LLD does not supply - * a function to initialize that data. - */ - if (!shost->hostt->init_cmd_priv) - memset(cmd + 1, 0, shost->hostt->cmd_size); - cmd->prot_op = SCSI_PROT_NORMAL; if (blk_rq_bytes(req)) cmd->sc_data_direction = rq_dma_dir(req); @@ -1842,6 +1835,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev, cmd)) goto out_dec_target_busy; + /* + * Only clear the driver-private command data if the LLD does not supply + * a function to initialize that data. + */ + if (shost->hostt->cmd_size && !shost->hostt->init_cmd_priv) + memset(cmd + 1, 0, shost->hostt->cmd_size); + if (!(req->rq_flags & RQF_DONTPREP)) { ret = scsi_prepare_cmd(req); if (ret != BLK_STS_OK) diff --git a/drivers/slimbus/messaging.c b/drivers/slimbus/messaging.c index e7aa9bd4b44b8..6f01d944f9c65 100644 --- a/drivers/slimbus/messaging.c +++ b/drivers/slimbus/messaging.c @@ -148,8 +148,9 @@ int slim_do_transfer(struct slim_controller *ctrl, struct slim_msg_txn *txn) } ret = ctrl->xfer_msg(ctrl, txn); - - if (!ret && need_tid && !txn->msg->comp) { + if (ret == -ETIMEDOUT) { + slim_free_txn_tid(ctrl, txn); + } else if (!ret && need_tid && !txn->msg->comp) { unsigned long ms = txn->rl + HZ; time_left = wait_for_completion_timeout(txn->comp, diff --git a/drivers/soc/loongson/loongson2_guts.c b/drivers/soc/loongson/loongson2_guts.c index ae42e3a9127fc..16913c3ef65ca 100644 --- a/drivers/soc/loongson/loongson2_guts.c +++ b/drivers/soc/loongson/loongson2_guts.c @@ -114,8 +114,11 @@ static int loongson2_guts_probe(struct platform_device *pdev) if (of_property_read_string(root, "model", &machine)) of_property_read_string_index(root, "compatible", 0, &machine); of_node_put(root); - if (machine) + if (machine) { soc_dev_attr.machine = devm_kstrdup(dev, machine, GFP_KERNEL); + if (!soc_dev_attr.machine) + return -ENOMEM; + } svr = loongson2_guts_get_svr(); soc_die = loongson2_soc_die_match(svr, loongson2_soc_die); diff --git a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h index 049246774cede..6146555fe9cfd 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h @@ -172,10 +172,10 @@ void atomisp_unregister_subdev(struct v4l2_subdev *subdev); #define IS_BYT __IS_SOC(INTEL_ATOM_SILVERMONT) #define IS_CHT __IS_SOC(INTEL_ATOM_AIRMONT) #define IS_MRFD __IS_SOC(INTEL_ATOM_SILVERMONT_MID) -#define IS_MOFD __IS_SOC(INTEL_ATOM_AIRMONT_MID) +#define IS_MOFD __IS_SOC(INTEL_ATOM_SILVERMONT_MID2) /* Both CHT and MOFD come with ISP2401 */ #define IS_ISP2401 __IS_SOCS(INTEL_ATOM_AIRMONT, \ - INTEL_ATOM_AIRMONT_MID) + INTEL_ATOM_SILVERMONT_MID2) #endif /* ATOMISP_PLATFORM_H_ */ diff --git a/drivers/tee/optee/supp.c b/drivers/tee/optee/supp.c index 322a543b8c278..d0f397c902420 100644 --- a/drivers/tee/optee/supp.c +++ b/drivers/tee/optee/supp.c @@ -80,7 +80,6 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params, struct optee *optee = tee_get_drvdata(ctx->teedev); struct optee_supp *supp = &optee->supp; struct optee_supp_req *req; - bool interruptable; u32 ret; /* @@ -111,36 +110,18 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params, /* * Wait for supplicant to process and return result, once we've * returned from wait_for_completion(&req->c) successfully we have - * exclusive access again. + * exclusive access again. Allow the wait to be killable such that + * the wait doesn't turn into an indefinite state if the supplicant + * gets hung for some reason. */ - while (wait_for_completion_interruptible(&req->c)) { + if (wait_for_completion_killable(&req->c)) { mutex_lock(&supp->mutex); - interruptable = !supp->ctx; - if (interruptable) { - /* - * There's no supplicant available and since the - * supp->mutex currently is held none can - * become available until the mutex released - * again. - * - * Interrupting an RPC to supplicant is only - * allowed as a way of slightly improving the user - * experience in case the supplicant hasn't been - * started yet. During normal operation the supplicant - * will serve all requests in a timely manner and - * interrupting then wouldn't make sense. - */ - if (req->in_queue) { - list_del(&req->link); - req->in_queue = false; - } + if (req->in_queue) { + list_del(&req->link); + req->in_queue = false; } mutex_unlock(&supp->mutex); - - if (interruptable) { - req->ret = TEEC_ERROR_COMMUNICATION; - break; - } + req->ret = TEEC_ERROR_COMMUNICATION; } ret = req->ret; diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 3b644de3292e2..0d9f636c80f4d 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -370,7 +370,7 @@ static void divvy_up_power(struct power_actor *power, int num_actors, for (i = 0; i < num_actors; i++) { struct power_actor *pa = &power[i]; - u64 req_range = (u64)pa->req_power * power_range; + u64 req_range = (u64)pa->weighted_req_power * power_range; pa->granted_power = DIV_ROUND_CLOSEST_ULL(req_range, total_req_power); @@ -641,6 +641,22 @@ static int allocate_actors_buffer(struct power_allocator_params *params, return ret; } +static void power_allocator_update_weight(struct power_allocator_params *params) +{ + const struct thermal_trip_desc *td; + struct thermal_instance *instance; + + if (!params->trip_max) + return; + + td = trip_to_trip_desc(params->trip_max); + + params->total_weight = 0; + list_for_each_entry(instance, &td->thermal_instances, trip_node) + if (power_actor_is_valid(instance)) + params->total_weight += instance->weight; +} + static void power_allocator_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason) { @@ -656,16 +672,12 @@ static void power_allocator_update_tz(struct thermal_zone_device *tz, if (power_actor_is_valid(instance)) num_actors++; - if (num_actors == params->num_actors) - return; + if (num_actors != params->num_actors) + allocate_actors_buffer(params, num_actors); - allocate_actors_buffer(params, num_actors); - break; + fallthrough; case THERMAL_INSTANCE_WEIGHT_CHANGED: - params->total_weight = 0; - list_for_each_entry(instance, &td->thermal_instances, trip_node) - if (power_actor_is_valid(instance)) - params->total_weight += instance->weight; + power_allocator_update_weight(params); break; default: break; @@ -731,6 +743,8 @@ static int power_allocator_bind(struct thermal_zone_device *tz) tz->governor_data = params; + power_allocator_update_weight(params); + return 0; free_params: diff --git a/drivers/thermal/intel/intel_tcc.c b/drivers/thermal/intel/intel_tcc.c index 817421508d5c9..b2a615aea7c14 100644 --- a/drivers/thermal/intel/intel_tcc.c +++ b/drivers/thermal/intel/intel_tcc.c @@ -106,7 +106,7 @@ static const struct x86_cpu_id intel_tcc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &temp_broadwell), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &temp_broadwell), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &temp_goldmont), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &temp_goldmont), diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 5ab4ce4daaebd..5401f03d6b6c1 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -274,6 +274,34 @@ static bool thermal_of_get_cooling_spec(struct device_node *map_np, int index, return true; } +static bool thermal_of_cm_lookup(struct device_node *cm_np, + const struct thermal_trip *trip, + struct thermal_cooling_device *cdev, + struct cooling_spec *c) +{ + for_each_child_of_node_scoped(cm_np, child) { + struct device_node *tr_np; + int count, i; + + tr_np = of_parse_phandle(child, "trip", 0); + if (tr_np != trip->priv) + continue; + + /* The trip has been found, look up the cdev. */ + count = of_count_phandle_with_args(child, "cooling-device", + "#cooling-cells"); + if (count <= 0) + pr_err("Add a cooling_device property with at least one device\n"); + + for (i = 0; i < count; i++) { + if (thermal_of_get_cooling_spec(child, i, cdev, c)) + return true; + } + } + + return false; +} + static bool thermal_of_should_bind(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev, @@ -293,27 +321,7 @@ static bool thermal_of_should_bind(struct thermal_zone_device *tz, goto out; /* Look up the trip and the cdev in the cooling maps. */ - for_each_child_of_node_scoped(cm_np, child) { - struct device_node *tr_np; - int count, i; - - tr_np = of_parse_phandle(child, "trip", 0); - if (tr_np != trip->priv) - continue; - - /* The trip has been found, look up the cdev. */ - count = of_count_phandle_with_args(child, "cooling-device", "#cooling-cells"); - if (count <= 0) - pr_err("Add a cooling_device property with at least one device\n"); - - for (i = 0; i < count; i++) { - result = thermal_of_get_cooling_spec(child, i, cdev, c); - if (result) - break; - } - - break; - } + result = thermal_of_cm_lookup(cm_np, trip, cdev, c); of_node_put(cm_np); out: diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c index d0b18358859e1..742004d63c6f0 100644 --- a/drivers/tty/serial/8250/8250_bcm7271.c +++ b/drivers/tty/serial/8250/8250_bcm7271.c @@ -1056,8 +1056,7 @@ static int brcmuart_probe(struct platform_device *pdev) } /* setup HR timer */ - hrtimer_init(&priv->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - priv->hrt.function = brcmuart_hrtimer_func; + hrtimer_setup(&priv->hrt, brcmuart_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); up.port.shutdown = brcmuart_shutdown; up.port.startup = brcmuart_startup; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 442967a6cd52d..c57f44882abbf 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -566,12 +566,10 @@ static int serial8250_em485_init(struct uart_8250_port *p) if (!p->em485) return -ENOMEM; - hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; - p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; + hrtimer_setup(&p->em485->stop_tx_timer, &serial8250_em485_handle_stop_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&p->em485->start_tx_timer, &serial8250_em485_handle_start_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); p->em485->port = p; p->em485->active_timer = NULL; p->em485->tx_stopped = true; diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 04212c823a91d..98f178bdbcbec 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2867,11 +2867,10 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id) return -EINVAL; } } - - hrtimer_init(&uap->trigger_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer_init(&uap->trigger_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - uap->trigger_start_tx.function = pl011_trigger_start_tx; - uap->trigger_stop_tx.function = pl011_trigger_stop_tx; + hrtimer_setup(&uap->trigger_start_tx, pl011_trigger_start_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&uap->trigger_stop_tx, pl011_trigger_stop_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); ret = pl011_setup_port(&dev->dev, uap, &dev->res, portnr); if (ret) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 9c59ec128bb4f..9a1afe409b989 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2582,10 +2582,10 @@ static int imx_uart_probe(struct platform_device *pdev) imx_uart_writel(sport, ucr3, UCR3); } - hrtimer_init(&sport->trigger_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer_init(&sport->trigger_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sport->trigger_start_tx.function = imx_trigger_start_tx; - sport->trigger_stop_tx.function = imx_trigger_stop_tx; + hrtimer_setup(&sport->trigger_start_tx, imx_trigger_start_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&sport->trigger_stop_tx, imx_trigger_stop_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Allocate the IRQ(s) i.MX1 has three interrupts whereas later diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index b1ea48f38248e..b72c3bc19bfa1 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1702,8 +1702,7 @@ static void sci_request_dma(struct uart_port *port) dma += s->buf_len_rx; } - hrtimer_init(&s->rx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - s->rx_timer.function = sci_dma_rx_timer_fn; + hrtimer_setup(&s->rx_timer, sci_dma_rx_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); s->chan_rx_saved = s->chan_rx = chan; diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index 92ec51870d1da..fe457bf1e15bb 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -454,7 +454,7 @@ static void cdns_uart_handle_tx(void *dev_id) if (cdns_uart->port->rs485.flags & SER_RS485_ENABLED && (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))) { - cdns_uart->tx_timer.function = &cdns_rs485_rx_callback; + hrtimer_update_function(&cdns_uart->tx_timer, cdns_rs485_rx_callback); hrtimer_start(&cdns_uart->tx_timer, ns_to_ktime(cdns_calc_after_tx_delay(cdns_uart)), HRTIMER_MODE_REL); } @@ -734,7 +734,7 @@ static void cdns_uart_start_tx(struct uart_port *port) if (cdns_uart->port->rs485.flags & SER_RS485_ENABLED) { if (!cdns_uart->rs485_tx_started) { - cdns_uart->tx_timer.function = &cdns_rs485_tx_callback; + hrtimer_update_function(&cdns_uart->tx_timer, cdns_rs485_tx_callback); cdns_rs485_tx_setup(cdns_uart); return hrtimer_start(&cdns_uart->tx_timer, ms_to_ktime(port->rs485.delay_rts_before_send), @@ -1626,8 +1626,8 @@ static int cdns_rs485_config(struct uart_port *port, struct ktermios *termios, writel(val, port->membase + CDNS_UART_MODEMCR); /* Timer setup */ - hrtimer_init(&cdns_uart->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cdns_uart->tx_timer.function = &cdns_rs485_tx_callback; + hrtimer_setup(&cdns_uart->tx_timer, &cdns_rs485_tx_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* Disable transmitter and make Rx setup*/ cdns_uart_stop_tx(port); diff --git a/drivers/ufs/core/ufs_bsg.c b/drivers/ufs/core/ufs_bsg.c index 8d4ad0a3f2cf0..252186124669a 100644 --- a/drivers/ufs/core/ufs_bsg.c +++ b/drivers/ufs/core/ufs_bsg.c @@ -194,10 +194,12 @@ static int ufs_bsg_request(struct bsg_job *job) ufshcd_rpm_put_sync(hba); kfree(buff); bsg_reply->result = ret; - job->reply_len = !rpmb ? sizeof(struct ufs_bsg_reply) : sizeof(struct ufs_rpmb_reply); /* complete the job here only if no error */ - if (ret == 0) + if (ret == 0) { + job->reply_len = rpmb ? sizeof(struct ufs_rpmb_reply) : + sizeof(struct ufs_bsg_reply); bsg_job_done(job, ret, bsg_reply->reply_payload_rcv_len); + } return ret; } diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 1893a7ad95316..464f13da259aa 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -266,7 +266,7 @@ static bool ufshcd_has_pending_tasks(struct ufs_hba *hba) static bool ufshcd_is_ufs_dev_busy(struct ufs_hba *hba) { - return hba->outstanding_reqs || ufshcd_has_pending_tasks(hba); + return scsi_host_busy(hba->host) || ufshcd_has_pending_tasks(hba); } static const struct ufs_dev_quirk ufs_fixups[] = { @@ -628,8 +628,8 @@ static void ufshcd_print_host_state(struct ufs_hba *hba) const struct scsi_device *sdev_ufs = hba->ufs_device_wlun; dev_err(hba->dev, "UFS Host state=%d\n", hba->ufshcd_state); - dev_err(hba->dev, "outstanding reqs=0x%lx tasks=0x%lx\n", - hba->outstanding_reqs, hba->outstanding_tasks); + dev_err(hba->dev, "%d outstanding reqs, tasks=0x%lx\n", + scsi_host_busy(hba->host), hba->outstanding_tasks); dev_err(hba->dev, "saved_err=0x%x, saved_uic_err=0x%x\n", hba->saved_err, hba->saved_uic_err); dev_err(hba->dev, "Device power mode=%d, UIC link state=%d\n", @@ -8882,7 +8882,7 @@ static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) dev_info(hba->dev, "%s() finished; outstanding_tasks = %#lx.\n", __func__, hba->outstanding_tasks); - return hba->outstanding_reqs ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE; + return scsi_host_busy(hba->host) ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE; } static const struct attribute_group *ufshcd_driver_groups[] = { @@ -10431,6 +10431,21 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) */ spin_lock_init(&hba->clk_gating.lock); + /* + * Set the default power management level for runtime and system PM. + * Host controller drivers can override them in their + * 'ufs_hba_variant_ops::init' callback. + * + * Default power saving mode is to keep UFS link in Hibern8 state + * and UFS device in sleep state. + */ + hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + UFS_SLEEP_PWR_MODE, + UIC_LINK_HIBERN8_STATE); + hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + UFS_SLEEP_PWR_MODE, + UIC_LINK_HIBERN8_STATE); + err = ufshcd_hba_init(hba); if (err) goto out_error; @@ -10544,21 +10559,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) goto out_disable; } - /* - * Set the default power management level for runtime and system PM if - * not set by the host controller drivers. - * Default power saving mode is to keep UFS link in Hibern8 state - * and UFS device in sleep state. - */ - if (!hba->rpm_lvl) - hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( - UFS_SLEEP_PWR_MODE, - UIC_LINK_HIBERN8_STATE); - if (!hba->spm_lvl) - hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( - UFS_SLEEP_PWR_MODE, - UIC_LINK_HIBERN8_STATE); - INIT_DELAYED_WORK(&hba->rpm_dev_flush_recheck_work, ufshcd_rpm_dev_flush_recheck_work); INIT_DELAYED_WORK(&hba->ufs_rtc_update_work, ufshcd_rtc_work); diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c index 0dd85d2635b99..47d06af33747d 100644 --- a/drivers/usb/atm/cxacru.c +++ b/drivers/usb/atm/cxacru.c @@ -1131,7 +1131,10 @@ static int cxacru_bind(struct usbatm_data *usbatm_instance, struct cxacru_data *instance; struct usb_device *usb_dev = interface_to_usbdev(intf); struct usb_host_endpoint *cmd_ep = usb_dev->ep_in[CXACRU_EP_CMD]; - struct usb_endpoint_descriptor *in, *out; + static const u8 ep_addrs[] = { + CXACRU_EP_CMD + USB_DIR_IN, + CXACRU_EP_CMD + USB_DIR_OUT, + 0}; int ret; /* instance init */ @@ -1179,13 +1182,11 @@ static int cxacru_bind(struct usbatm_data *usbatm_instance, } if (usb_endpoint_xfer_int(&cmd_ep->desc)) - ret = usb_find_common_endpoints(intf->cur_altsetting, - NULL, NULL, &in, &out); + ret = usb_check_int_endpoints(intf, ep_addrs); else - ret = usb_find_common_endpoints(intf->cur_altsetting, - &in, &out, NULL, NULL); + ret = usb_check_bulk_endpoints(intf, ep_addrs); - if (ret) { + if (!ret) { usb_err(usbatm_instance, "cxacru_bind: interface has incorrect endpoints\n"); ret = -ENODEV; goto fail; diff --git a/drivers/usb/chipidea/otg_fsm.c b/drivers/usb/chipidea/otg_fsm.c index c17516c29b63b..a093544482d5c 100644 --- a/drivers/usb/chipidea/otg_fsm.c +++ b/drivers/usb/chipidea/otg_fsm.c @@ -424,8 +424,7 @@ static enum hrtimer_restart ci_otg_hrtimer_func(struct hrtimer *t) /* Initialize timers */ static int ci_otg_init_timers(struct ci_hdrc *ci) { - hrtimer_init(&ci->otg_fsm_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ci->otg_fsm_hrtimer.function = ci_otg_hrtimer_func; + hrtimer_setup(&ci->otg_fsm_hrtimer, ci_otg_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); return 0; } diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index a76bb50b62026..dcba4281ea486 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -6065,6 +6065,36 @@ void usb_hub_cleanup(void) usb_deregister(&hub_driver); } /* usb_hub_cleanup() */ +/** + * hub_hc_release_resources - clear resources used by host controller + * @udev: pointer to device being released + * + * Context: task context, might sleep + * + * Function releases the host controller resources in correct order before + * making any operation on resuming usb device. The host controller resources + * allocated for devices in tree should be released starting from the last + * usb device in tree toward the root hub. This function is used only during + * resuming device when usb device require reinitialization – that is, when + * flag udev->reset_resume is set. + * + * This call is synchronous, and may not be used in an interrupt context. + */ +static void hub_hc_release_resources(struct usb_device *udev) +{ + struct usb_hub *hub = usb_hub_to_struct_hub(udev); + struct usb_hcd *hcd = bus_to_hcd(udev->bus); + int i; + + /* Release up resources for all children before this device */ + for (i = 0; i < udev->maxchild; i++) + if (hub->ports[i]->child) + hub_hc_release_resources(hub->ports[i]->child); + + if (hcd->driver->reset_device) + hcd->driver->reset_device(hcd, udev); +} + /** * usb_reset_and_verify_device - perform a USB port reset to reinitialize a device * @udev: device to reset (not in SUSPENDED or NOTATTACHED state) @@ -6129,6 +6159,9 @@ static int usb_reset_and_verify_device(struct usb_device *udev) bos = udev->bos; udev->bos = NULL; + if (udev->reset_resume) + hub_hc_release_resources(udev); + mutex_lock(hcd->address0_mutex); for (i = 0; i < PORT_INIT_TRIES; ++i) { diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index dfcfc142bd5e1..8efbacc5bc341 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -341,6 +341,10 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0638, 0x0a13), .driver_info = USB_QUIRK_STRING_FETCH_255 }, + /* Prolific Single-LUN Mass Storage Card Reader */ + { USB_DEVICE(0x067b, 0x2731), .driver_info = USB_QUIRK_DELAY_INIT | + USB_QUIRK_NO_LPM }, + /* Saitek Cyborg Gold Joystick */ { USB_DEVICE(0x06a3, 0x0006), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, diff --git a/drivers/usb/dwc2/hcd_queue.c b/drivers/usb/dwc2/hcd_queue.c index 238c6fd50e75a..2a542a99ec447 100644 --- a/drivers/usb/dwc2/hcd_queue.c +++ b/drivers/usb/dwc2/hcd_queue.c @@ -1459,8 +1459,7 @@ static void dwc2_qh_init(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh, /* Initialize QH */ qh->hsotg = hsotg; timer_setup(&qh->unreserve_timer, dwc2_unreserve_timer_fn, 0); - hrtimer_init(&qh->wait_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - qh->wait_timer.function = &dwc2_wait_timer_fn; + hrtimer_setup(&qh->wait_timer, &dwc2_wait_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); qh->ep_type = ep_type; qh->ep_is_in = ep_is_in; diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index dfa1b5fe48dc4..66a08b5271653 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -131,11 +131,24 @@ void dwc3_enable_susphy(struct dwc3 *dwc, bool enable) } } -void dwc3_set_prtcap(struct dwc3 *dwc, u32 mode) +void dwc3_set_prtcap(struct dwc3 *dwc, u32 mode, bool ignore_susphy) { + unsigned int hw_mode; u32 reg; reg = dwc3_readl(dwc->regs, DWC3_GCTL); + + /* + * For DRD controllers, GUSB3PIPECTL.SUSPENDENABLE and + * GUSB2PHYCFG.SUSPHY should be cleared during mode switching, + * and they can be set after core initialization. + */ + hw_mode = DWC3_GHWPARAMS0_MODE(dwc->hwparams.hwparams0); + if (hw_mode == DWC3_GHWPARAMS0_MODE_DRD && !ignore_susphy) { + if (DWC3_GCTL_PRTCAP(reg) != mode) + dwc3_enable_susphy(dwc, false); + } + reg &= ~(DWC3_GCTL_PRTCAPDIR(DWC3_GCTL_PRTCAP_OTG)); reg |= DWC3_GCTL_PRTCAPDIR(mode); dwc3_writel(dwc->regs, DWC3_GCTL, reg); @@ -216,7 +229,7 @@ static void __dwc3_set_mode(struct work_struct *work) spin_lock_irqsave(&dwc->lock, flags); - dwc3_set_prtcap(dwc, desired_dr_role); + dwc3_set_prtcap(dwc, desired_dr_role, false); spin_unlock_irqrestore(&dwc->lock, flags); @@ -658,16 +671,7 @@ static int dwc3_ss_phy_setup(struct dwc3 *dwc, int index) */ reg &= ~DWC3_GUSB3PIPECTL_UX_EXIT_PX; - /* - * Above DWC_usb3.0 1.94a, it is recommended to set - * DWC3_GUSB3PIPECTL_SUSPHY to '0' during coreConsultant configuration. - * So default value will be '0' when the core is reset. Application - * needs to set it to '1' after the core initialization is completed. - * - * Similarly for DRD controllers, GUSB3PIPECTL.SUSPENDENABLE must be - * cleared after power-on reset, and it can be set after core - * initialization. - */ + /* Ensure the GUSB3PIPECTL.SUSPENDENABLE is cleared prior to phy init. */ reg &= ~DWC3_GUSB3PIPECTL_SUSPHY; if (dwc->u2ss_inp3_quirk) @@ -747,15 +751,7 @@ static int dwc3_hs_phy_setup(struct dwc3 *dwc, int index) break; } - /* - * Above DWC_usb3.0 1.94a, it is recommended to set - * DWC3_GUSB2PHYCFG_SUSPHY to '0' during coreConsultant configuration. - * So default value will be '0' when the core is reset. Application - * needs to set it to '1' after the core initialization is completed. - * - * Similarly for DRD controllers, GUSB2PHYCFG.SUSPHY must be cleared - * after power-on reset, and it can be set after core initialization. - */ + /* Ensure the GUSB2PHYCFG.SUSPHY is cleared prior to phy init. */ reg &= ~DWC3_GUSB2PHYCFG_SUSPHY; if (dwc->dis_enblslpm_quirk) @@ -830,6 +826,25 @@ static int dwc3_phy_init(struct dwc3 *dwc) goto err_exit_usb3_phy; } + /* + * Above DWC_usb3.0 1.94a, it is recommended to set + * DWC3_GUSB3PIPECTL_SUSPHY and DWC3_GUSB2PHYCFG_SUSPHY to '0' during + * coreConsultant configuration. So default value will be '0' when the + * core is reset. Application needs to set it to '1' after the core + * initialization is completed. + * + * Certain phy requires to be in P0 power state during initialization. + * Make sure GUSB3PIPECTL.SUSPENDENABLE and GUSB2PHYCFG.SUSPHY are clear + * prior to phy init to maintain in the P0 state. + * + * After phy initialization, some phy operations can only be executed + * while in lower P states. Ensure GUSB3PIPECTL.SUSPENDENABLE and + * GUSB2PHYCFG.SUSPHY are set soon after initialization to avoid + * blocking phy ops. + */ + if (!DWC3_VER_IS_WITHIN(DWC3, ANY, 194A)) + dwc3_enable_susphy(dwc, true); + return 0; err_exit_usb3_phy: @@ -1588,7 +1603,7 @@ static int dwc3_core_init_mode(struct dwc3 *dwc) switch (dwc->dr_mode) { case USB_DR_MODE_PERIPHERAL: - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE, false); if (dwc->usb2_phy) otg_set_vbus(dwc->usb2_phy->otg, false); @@ -1600,7 +1615,7 @@ static int dwc3_core_init_mode(struct dwc3 *dwc) return dev_err_probe(dev, ret, "failed to initialize gadget\n"); break; case USB_DR_MODE_HOST: - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_HOST); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_HOST, false); if (dwc->usb2_phy) otg_set_vbus(dwc->usb2_phy->otg, true); @@ -1645,7 +1660,7 @@ static void dwc3_core_exit_mode(struct dwc3 *dwc) } /* de-assert DRVVBUS for HOST and OTG mode */ - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE, true); } static void dwc3_get_software_properties(struct dwc3 *dwc) @@ -1835,8 +1850,6 @@ static void dwc3_get_properties(struct dwc3 *dwc) dwc->tx_thr_num_pkt_prd = tx_thr_num_pkt_prd; dwc->tx_max_burst_prd = tx_max_burst_prd; - dwc->imod_interval = 0; - dwc->tx_fifo_resize_max_num = tx_fifo_resize_max_num; } @@ -1854,21 +1867,19 @@ static void dwc3_check_params(struct dwc3 *dwc) unsigned int hwparam_gen = DWC3_GHWPARAMS3_SSPHY_IFC(dwc->hwparams.hwparams3); - /* Check for proper value of imod_interval */ - if (dwc->imod_interval && !dwc3_has_imod(dwc)) { - dev_warn(dwc->dev, "Interrupt moderation not supported\n"); - dwc->imod_interval = 0; - } - /* + * Enable IMOD for all supporting controllers. + * + * Particularly, DWC_usb3 v3.00a must enable this feature for + * the following reason: + * * Workaround for STAR 9000961433 which affects only version * 3.00a of the DWC_usb3 core. This prevents the controller * interrupt from being masked while handling events. IMOD * allows us to work around this issue. Enable it for the * affected version. */ - if (!dwc->imod_interval && - DWC3_VER_IS(DWC3, 300A)) + if (dwc3_has_imod((dwc))) dwc->imod_interval = 1; /* Check the maximum_speed parameter */ @@ -2457,7 +2468,7 @@ static int dwc3_resume_common(struct dwc3 *dwc, pm_message_t msg) if (ret) return ret; - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE, true); dwc3_gadget_resume(dwc); break; case DWC3_GCTL_PRTCAP_HOST: @@ -2465,7 +2476,7 @@ static int dwc3_resume_common(struct dwc3 *dwc, pm_message_t msg) ret = dwc3_core_init_for_resume(dwc); if (ret) return ret; - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_HOST); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_HOST, true); break; } /* Restore GUSB2PHYCFG bits that were modified in suspend */ @@ -2494,7 +2505,7 @@ static int dwc3_resume_common(struct dwc3 *dwc, pm_message_t msg) if (ret) return ret; - dwc3_set_prtcap(dwc, dwc->current_dr_role); + dwc3_set_prtcap(dwc, dwc->current_dr_role, true); dwc3_otg_init(dwc); if (dwc->current_otg_role == DWC3_OTG_ROLE_HOST) { diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index c955039bb4f62..aaa39e663f60a 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -1558,7 +1558,7 @@ struct dwc3_gadget_ep_cmd_params { #define DWC3_HAS_OTG BIT(3) /* prototypes */ -void dwc3_set_prtcap(struct dwc3 *dwc, u32 mode); +void dwc3_set_prtcap(struct dwc3 *dwc, u32 mode, bool ignore_susphy); void dwc3_set_mode(struct dwc3 *dwc, u32 mode); u32 dwc3_core_fifo_space(struct dwc3_ep *dep, u8 type); diff --git a/drivers/usb/dwc3/drd.c b/drivers/usb/dwc3/drd.c index d76ae676783cf..7977860932b14 100644 --- a/drivers/usb/dwc3/drd.c +++ b/drivers/usb/dwc3/drd.c @@ -173,7 +173,7 @@ void dwc3_otg_init(struct dwc3 *dwc) * block "Initialize GCTL for OTG operation". */ /* GCTL.PrtCapDir=2'b11 */ - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_OTG); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_OTG, true); /* GUSB2PHYCFG0.SusPHY=0 */ reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); reg &= ~DWC3_GUSB2PHYCFG_SUSPHY; @@ -556,7 +556,7 @@ int dwc3_drd_init(struct dwc3 *dwc) dwc3_drd_update(dwc); } else { - dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_OTG); + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_OTG, true); /* use OTG block to get ID event */ irq = dwc3_otg_get_irq(dwc); diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index ddd6b2ce57107..89a4dc8ebf948 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4501,14 +4501,18 @@ static irqreturn_t dwc3_process_event_buf(struct dwc3_event_buffer *evt) dwc3_writel(dwc->regs, DWC3_GEVNTSIZ(0), DWC3_GEVNTSIZ_SIZE(evt->length)); + evt->flags &= ~DWC3_EVENT_PENDING; + /* + * Add an explicit write memory barrier to make sure that the update of + * clearing DWC3_EVENT_PENDING is observed in dwc3_check_event_buf() + */ + wmb(); + if (dwc->imod_interval) { dwc3_writel(dwc->regs, DWC3_GEVNTCOUNT(0), DWC3_GEVNTCOUNT_EHB); dwc3_writel(dwc->regs, DWC3_DEV_IMOD(0), dwc->imod_interval); } - /* Keep the clearing of DWC3_EVENT_PENDING at the end */ - evt->flags &= ~DWC3_EVENT_PENDING; - return ret; } diff --git a/drivers/usb/fotg210/fotg210-hcd.c b/drivers/usb/fotg210/fotg210-hcd.c index 3d404d19a205e..64c4965a160f4 100644 --- a/drivers/usb/fotg210/fotg210-hcd.c +++ b/drivers/usb/fotg210/fotg210-hcd.c @@ -4901,8 +4901,7 @@ static int hcd_fotg210_init(struct usb_hcd *hcd) */ fotg210->need_io_watchdog = 1; - hrtimer_init(&fotg210->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - fotg210->hrtimer.function = fotg210_hrtimer_func; + hrtimer_setup(&fotg210->hrtimer, fotg210_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); fotg210->next_hrtimer_event = FOTG210_HRTIMER_NO_EVENT; hcc_params = fotg210_readl(fotg210, &fotg210->caps->hcc_params); diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index bdda8c74602de..869ad99afb48b 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1050,10 +1050,11 @@ static int set_config(struct usb_composite_dev *cdev, else usb_gadget_set_remote_wakeup(gadget, 0); done: - if (power <= USB_SELF_POWER_VBUS_MAX_DRAW) - usb_gadget_set_selfpowered(gadget); - else + if (power > USB_SELF_POWER_VBUS_MAX_DRAW || + (c && !(c->bmAttributes & USB_CONFIG_ATT_SELFPOWER))) usb_gadget_clear_selfpowered(gadget); + else + usb_gadget_set_selfpowered(gadget); usb_gadget_vbus_draw(gadget, power); if (result >= 0 && cdev->delayed_status) @@ -2615,7 +2616,10 @@ void composite_suspend(struct usb_gadget *gadget) cdev->suspended = 1; - usb_gadget_set_selfpowered(gadget); + if (cdev->config && + cdev->config->bmAttributes & USB_CONFIG_ATT_SELFPOWER) + usb_gadget_set_selfpowered(gadget); + usb_gadget_vbus_draw(gadget, 2); } @@ -2649,8 +2653,11 @@ void composite_resume(struct usb_gadget *gadget) else maxpower = min(maxpower, 900U); - if (maxpower > USB_SELF_POWER_VBUS_MAX_DRAW) + if (maxpower > USB_SELF_POWER_VBUS_MAX_DRAW || + !(cdev->config->bmAttributes & USB_CONFIG_ATT_SELFPOWER)) usb_gadget_clear_selfpowered(gadget); + else + usb_gadget_set_selfpowered(gadget); usb_gadget_vbus_draw(gadget, maxpower); } else { diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index f60576a65ca67..58b0dd575af32 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1559,8 +1559,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) ncm->port.open = ncm_open; ncm->port.close = ncm_close; - hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - ncm->task_timer.function = ncm_tx_timeout; + hrtimer_setup(&ncm->task_timer, ncm_tx_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); DBG(cdev, "CDC Network: IN/%s OUT/%s NOTIFY/%s\n", ncm->port.in_ep->name, ncm->port.out_ep->name, diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 09e2838917e29..f58590bf5e02f 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -1052,8 +1052,8 @@ void gether_suspend(struct gether *link) * There is a transfer in progress. So we trigger a remote * wakeup to inform the host. */ - ether_wakeup_host(dev->port_usb); - return; + if (!ether_wakeup_host(dev->port_usb)) + return; } spin_lock_irqsave(&dev->lock, flags); link->is_suspend = true; diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index bda08c5ba7c01..4f1b5db51ddae 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -2479,8 +2479,7 @@ static DEVICE_ATTR_RO(urbs); static int dummy_start_ss(struct dummy_hcd *dum_hcd) { - hrtimer_init(&dum_hcd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - dum_hcd->timer.function = dummy_timer; + hrtimer_setup(&dum_hcd->timer, dummy_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dum_hcd->rh_state = DUMMY_RH_RUNNING; dum_hcd->stream_en_ep = 0; INIT_LIST_HEAD(&dum_hcd->urbp_list); @@ -2509,8 +2508,7 @@ static int dummy_start(struct usb_hcd *hcd) return dummy_start_ss(dum_hcd); spin_lock_init(&dum_hcd->dum->lock); - hrtimer_init(&dum_hcd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - dum_hcd->timer.function = dummy_timer; + hrtimer_setup(&dum_hcd->timer, dummy_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dum_hcd->rh_state = DUMMY_RH_RUNNING; INIT_LIST_HEAD(&dum_hcd->urbp_list); diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 6de79ac5e6a44..6d1d190c914d4 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -466,8 +466,7 @@ static int ehci_init(struct usb_hcd *hcd) */ ehci->need_io_watchdog = 1; - hrtimer_init(&ehci->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ehci->hrtimer.function = ehci_hrtimer_func; + hrtimer_setup(&ehci->hrtimer, ehci_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ehci->next_hrtimer_event = EHCI_HRTIMER_NO_EVENT; hcc_params = ehci_readl(ehci, &ehci->caps->hcc_params); diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 9693464c05204..69c278b64084b 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "xhci.h" #include "xhci-trace.h" @@ -770,9 +771,16 @@ static int xhci_exit_test_mode(struct xhci_hcd *xhci) enum usb_link_tunnel_mode xhci_port_is_tunneled(struct xhci_hcd *xhci, struct xhci_port *port) { + struct usb_hcd *hcd; void __iomem *base; u32 offset; + /* Don't try and probe this capability for non-Intel hosts */ + hcd = xhci_to_hcd(xhci); + if (!dev_is_pci(hcd->self.controller) || + to_pci_dev(hcd->self.controller)->vendor != PCI_VENDOR_ID_INTEL) + return USB_LINK_UNKNOWN; + base = &xhci->cap_regs->hc_capbase; offset = xhci_find_next_ext_cap(base, 0, XHCI_EXT_CAPS_INTEL_SPR_SHADOW); diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 92703efda1f7b..fdf0c1008225a 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -2437,7 +2437,8 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) * and our use of dma addresses in the trb_address_map radix tree needs * TRB_SEGMENT_SIZE alignment, so we pick the greater alignment need. */ - if (xhci->quirks & XHCI_ZHAOXIN_TRB_FETCH) + if (xhci->quirks & XHCI_TRB_OVERFETCH) + /* Buggy HC prefetches beyond segment bounds - allocate dummy space at the end */ xhci->segment_pool = dma_pool_create("xHCI ring segments", dev, TRB_SEGMENT_SIZE * 2, TRB_SEGMENT_SIZE * 2, xhci->page_size * 2); else diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index ad0ff356f6fa0..54460d11f7ee8 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -38,6 +38,8 @@ #define PCI_DEVICE_ID_ETRON_EJ168 0x7023 #define PCI_DEVICE_ID_ETRON_EJ188 0x7052 +#define PCI_DEVICE_ID_VIA_VL805 0x3483 + #define PCI_DEVICE_ID_INTEL_LYNXPOINT_XHCI 0x8c31 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 #define PCI_DEVICE_ID_INTEL_WILDCATPOINT_LP_XHCI 0x9cb1 @@ -418,8 +420,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == 0x3432) xhci->quirks |= XHCI_BROKEN_STREAMS; - if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) + if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == PCI_DEVICE_ID_VIA_VL805) { xhci->quirks |= XHCI_LPM_SUPPORT; + xhci->quirks |= XHCI_TRB_OVERFETCH; + } if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA && pdev->device == PCI_DEVICE_ID_ASMEDIA_1042_XHCI) { @@ -467,11 +471,11 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) if (pdev->device == 0x9202) { xhci->quirks |= XHCI_RESET_ON_RESUME; - xhci->quirks |= XHCI_ZHAOXIN_TRB_FETCH; + xhci->quirks |= XHCI_TRB_OVERFETCH; } if (pdev->device == 0x9203) - xhci->quirks |= XHCI_ZHAOXIN_TRB_FETCH; + xhci->quirks |= XHCI_TRB_OVERFETCH; } if (pdev->vendor == PCI_VENDOR_ID_CDNS && diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 45653114ccd7f..1a90ebc8a30ea 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -780,8 +780,12 @@ static void xhci_clear_command_ring(struct xhci_hcd *xhci) struct xhci_segment *seg; ring = xhci->cmd_ring; - xhci_for_each_ring_seg(ring->first_seg, seg) + xhci_for_each_ring_seg(ring->first_seg, seg) { + /* erase all TRBs before the link */ memset(seg->trbs, 0, sizeof(union xhci_trb) * (TRBS_PER_SEGMENT - 1)); + /* clear link cycle bit */ + seg->trbs[TRBS_PER_SEGMENT - 1].link.control &= cpu_to_le32(~TRB_CYCLE); + } xhci_initialize_ring_info(ring); /* diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 8c164340a2c35..779b01dee068f 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1632,7 +1632,7 @@ struct xhci_hcd { #define XHCI_EP_CTX_BROKEN_DCS BIT_ULL(42) #define XHCI_SUSPEND_RESUME_CLKS BIT_ULL(43) #define XHCI_RESET_TO_DEFAULT BIT_ULL(44) -#define XHCI_ZHAOXIN_TRB_FETCH BIT_ULL(45) +#define XHCI_TRB_OVERFETCH BIT_ULL(45) #define XHCI_ZHAOXIN_HOST BIT_ULL(46) #define XHCI_WRITE_64_HI_LO BIT_ULL(47) #define XHCI_CDNS_SCTX_QUIRK BIT_ULL(48) diff --git a/drivers/usb/musb/musb_cppi41.c b/drivers/usb/musb/musb_cppi41.c index 9589243e89518..4cde3abb7006f 100644 --- a/drivers/usb/musb/musb_cppi41.c +++ b/drivers/usb/musb/musb_cppi41.c @@ -760,8 +760,8 @@ cppi41_dma_controller_create(struct musb *musb, void __iomem *base) if (!controller) goto kzalloc_fail; - hrtimer_init(&controller->early_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - controller->early_tx.function = cppi41_recheck_tx_req; + hrtimer_setup(&controller->early_tx, cppi41_recheck_tx_req, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); INIT_LIST_HEAD(&controller->early_tx_list); controller->controller.channel_alloc = cppi41_dma_channel_allocate; diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c index 935fc496fe94b..4b35ef216125c 100644 --- a/drivers/usb/renesas_usbhs/common.c +++ b/drivers/usb/renesas_usbhs/common.c @@ -312,8 +312,10 @@ static int usbhsc_clk_get(struct device *dev, struct usbhs_priv *priv) priv->clks[1] = of_clk_get(dev_of_node(dev), 1); if (PTR_ERR(priv->clks[1]) == -ENOENT) priv->clks[1] = NULL; - else if (IS_ERR(priv->clks[1])) + else if (IS_ERR(priv->clks[1])) { + clk_put(priv->clks[0]); return PTR_ERR(priv->clks[1]); + } return 0; } @@ -779,6 +781,8 @@ static void usbhs_remove(struct platform_device *pdev) dev_dbg(&pdev->dev, "usb remove\n"); + flush_delayed_work(&priv->notify_hotplug_work); + /* power off */ if (!usbhs_get_dparam(priv, runtime_pwctrl)) usbhsc_power_ctrl(priv, 0); diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index 105132ae87acb..e8e5723f54122 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -1094,7 +1094,7 @@ int usbhs_mod_gadget_probe(struct usbhs_priv *priv) goto usbhs_mod_gadget_probe_err_gpriv; } - gpriv->transceiver = usb_get_phy(USB_PHY_TYPE_UNDEFINED); + gpriv->transceiver = devm_usb_get_phy(dev, USB_PHY_TYPE_UNDEFINED); dev_info(dev, "%stransceiver found\n", !IS_ERR(gpriv->transceiver) ? "" : "no "); diff --git a/drivers/usb/typec/tcpm/tcpci_rt1711h.c b/drivers/usb/typec/tcpm/tcpci_rt1711h.c index 64f6dd0dc6609..88c50b984e8a3 100644 --- a/drivers/usb/typec/tcpm/tcpci_rt1711h.c +++ b/drivers/usb/typec/tcpm/tcpci_rt1711h.c @@ -334,6 +334,11 @@ static int rt1711h_probe(struct i2c_client *client) { int ret; struct rt1711h_chip *chip; + const u16 alert_mask = TCPC_ALERT_TX_SUCCESS | TCPC_ALERT_TX_DISCARDED | + TCPC_ALERT_TX_FAILED | TCPC_ALERT_RX_HARD_RST | + TCPC_ALERT_RX_STATUS | TCPC_ALERT_POWER_STATUS | + TCPC_ALERT_CC_STATUS | TCPC_ALERT_RX_BUF_OVF | + TCPC_ALERT_FAULT; chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); if (!chip) @@ -382,6 +387,12 @@ static int rt1711h_probe(struct i2c_client *client) dev_name(chip->dev), chip); if (ret < 0) return ret; + + /* Enable alert interrupts */ + ret = rt1711h_write16(chip, TCPC_ALERT_MASK, alert_mask); + if (ret < 0) + return ret; + enable_irq_wake(client->irq); return 0; diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 6bf1a22c785af..9c455f0732330 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -7721,14 +7721,14 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) kthread_init_work(&port->event_work, tcpm_pd_event_handler); kthread_init_work(&port->enable_frs, tcpm_enable_frs_work); kthread_init_work(&port->send_discover_work, tcpm_send_discover_work); - hrtimer_init(&port->state_machine_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->state_machine_timer.function = state_machine_timer_handler; - hrtimer_init(&port->vdm_state_machine_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->vdm_state_machine_timer.function = vdm_state_machine_timer_handler; - hrtimer_init(&port->enable_frs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->enable_frs_timer.function = enable_frs_timer_handler; - hrtimer_init(&port->send_discover_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->send_discover_timer.function = send_discover_timer_handler; + hrtimer_setup(&port->state_machine_timer, state_machine_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&port->vdm_state_machine_timer, vdm_state_machine_timer_handler, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_setup(&port->enable_frs_timer, enable_frs_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&port->send_discover_timer, send_discover_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); spin_lock_init(&port->pd_event_lock); diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index fcf499cc9458c..2a2915b0a645f 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -25,7 +25,7 @@ * difficult to estimate the time it takes for the system to process the command * before it is actually passed to the PPM. */ -#define UCSI_TIMEOUT_MS 5000 +#define UCSI_TIMEOUT_MS 10000 /* * UCSI_SWAP_TIMEOUT_MS - Timeout for role swap requests @@ -1346,7 +1346,7 @@ static int ucsi_reset_ppm(struct ucsi *ucsi) mutex_lock(&ucsi->ppm_lock); - ret = ucsi->ops->read_cci(ucsi, &cci); + ret = ucsi->ops->poll_cci(ucsi, &cci); if (ret < 0) goto out; @@ -1364,7 +1364,7 @@ static int ucsi_reset_ppm(struct ucsi *ucsi) tmo = jiffies + msecs_to_jiffies(UCSI_TIMEOUT_MS); do { - ret = ucsi->ops->read_cci(ucsi, &cci); + ret = ucsi->ops->poll_cci(ucsi, &cci); if (ret < 0) goto out; if (cci & UCSI_CCI_COMMAND_COMPLETE) @@ -1393,7 +1393,7 @@ static int ucsi_reset_ppm(struct ucsi *ucsi) /* Give the PPM time to process a reset before reading CCI */ msleep(20); - ret = ucsi->ops->read_cci(ucsi, &cci); + ret = ucsi->ops->poll_cci(ucsi, &cci); if (ret) goto out; @@ -1825,11 +1825,11 @@ static int ucsi_init(struct ucsi *ucsi) err_unregister: for (con = connector; con->port; con++) { + if (con->wq) + destroy_workqueue(con->wq); ucsi_unregister_partner(con); ucsi_unregister_altmodes(con, UCSI_RECIPIENT_CON); ucsi_unregister_port_psy(con); - if (con->wq) - destroy_workqueue(con->wq); usb_power_delivery_unregister_capabilities(con->port_sink_caps); con->port_sink_caps = NULL; @@ -1929,8 +1929,8 @@ struct ucsi *ucsi_create(struct device *dev, const struct ucsi_operations *ops) struct ucsi *ucsi; if (!ops || - !ops->read_version || !ops->read_cci || !ops->read_message_in || - !ops->sync_control || !ops->async_control) + !ops->read_version || !ops->read_cci || !ops->poll_cci || + !ops->read_message_in || !ops->sync_control || !ops->async_control) return ERR_PTR(-EINVAL); ucsi = kzalloc(sizeof(*ucsi), GFP_KERNEL); @@ -2013,10 +2013,6 @@ void ucsi_unregister(struct ucsi *ucsi) for (i = 0; i < ucsi->cap.num_connectors; i++) { cancel_work_sync(&ucsi->connector[i].work); - ucsi_unregister_partner(&ucsi->connector[i]); - ucsi_unregister_altmodes(&ucsi->connector[i], - UCSI_RECIPIENT_CON); - ucsi_unregister_port_psy(&ucsi->connector[i]); if (ucsi->connector[i].wq) { struct ucsi_work *uwork; @@ -2032,6 +2028,11 @@ void ucsi_unregister(struct ucsi *ucsi) destroy_workqueue(ucsi->connector[i].wq); } + ucsi_unregister_partner(&ucsi->connector[i]); + ucsi_unregister_altmodes(&ucsi->connector[i], + UCSI_RECIPIENT_CON); + ucsi_unregister_port_psy(&ucsi->connector[i]); + usb_power_delivery_unregister_capabilities(ucsi->connector[i].port_sink_caps); ucsi->connector[i].port_sink_caps = NULL; usb_power_delivery_unregister_capabilities(ucsi->connector[i].port_source_caps); diff --git a/drivers/usb/typec/ucsi/ucsi.h b/drivers/usb/typec/ucsi/ucsi.h index 82735eb34f0e3..28780acc4af2e 100644 --- a/drivers/usb/typec/ucsi/ucsi.h +++ b/drivers/usb/typec/ucsi/ucsi.h @@ -62,6 +62,7 @@ struct dentry; * struct ucsi_operations - UCSI I/O operations * @read_version: Read implemented UCSI version * @read_cci: Read CCI register + * @poll_cci: Read CCI register while polling with notifications disabled * @read_message_in: Read message data from UCSI * @sync_control: Blocking control operation * @async_control: Non-blocking control operation @@ -76,6 +77,7 @@ struct dentry; struct ucsi_operations { int (*read_version)(struct ucsi *ucsi, u16 *version); int (*read_cci)(struct ucsi *ucsi, u32 *cci); + int (*poll_cci)(struct ucsi *ucsi, u32 *cci); int (*read_message_in)(struct ucsi *ucsi, void *val, size_t val_len); int (*sync_control)(struct ucsi *ucsi, u64 command); int (*async_control)(struct ucsi *ucsi, u64 command); diff --git a/drivers/usb/typec/ucsi/ucsi_acpi.c b/drivers/usb/typec/ucsi/ucsi_acpi.c index 5c55155519634..ac1ebb5d95272 100644 --- a/drivers/usb/typec/ucsi/ucsi_acpi.c +++ b/drivers/usb/typec/ucsi/ucsi_acpi.c @@ -59,19 +59,24 @@ static int ucsi_acpi_read_version(struct ucsi *ucsi, u16 *version) static int ucsi_acpi_read_cci(struct ucsi *ucsi, u32 *cci) { struct ucsi_acpi *ua = ucsi_get_drvdata(ucsi); - int ret; - - if (UCSI_COMMAND(ua->cmd) == UCSI_PPM_RESET) { - ret = ucsi_acpi_dsm(ua, UCSI_DSM_FUNC_READ); - if (ret) - return ret; - } memcpy(cci, ua->base + UCSI_CCI, sizeof(*cci)); return 0; } +static int ucsi_acpi_poll_cci(struct ucsi *ucsi, u32 *cci) +{ + struct ucsi_acpi *ua = ucsi_get_drvdata(ucsi); + int ret; + + ret = ucsi_acpi_dsm(ua, UCSI_DSM_FUNC_READ); + if (ret) + return ret; + + return ucsi_acpi_read_cci(ucsi, cci); +} + static int ucsi_acpi_read_message_in(struct ucsi *ucsi, void *val, size_t val_len) { struct ucsi_acpi *ua = ucsi_get_drvdata(ucsi); @@ -94,6 +99,7 @@ static int ucsi_acpi_async_control(struct ucsi *ucsi, u64 command) static const struct ucsi_operations ucsi_acpi_ops = { .read_version = ucsi_acpi_read_version, .read_cci = ucsi_acpi_read_cci, + .poll_cci = ucsi_acpi_poll_cci, .read_message_in = ucsi_acpi_read_message_in, .sync_control = ucsi_sync_control_common, .async_control = ucsi_acpi_async_control @@ -142,6 +148,7 @@ static int ucsi_gram_sync_control(struct ucsi *ucsi, u64 command) static const struct ucsi_operations ucsi_gram_ops = { .read_version = ucsi_acpi_read_version, .read_cci = ucsi_acpi_read_cci, + .poll_cci = ucsi_acpi_poll_cci, .read_message_in = ucsi_gram_read_message_in, .sync_control = ucsi_gram_sync_control, .async_control = ucsi_acpi_async_control diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index 740171f24ef9f..4b1668733a4be 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -664,6 +664,7 @@ static int ucsi_ccg_sync_control(struct ucsi *ucsi, u64 command) static const struct ucsi_operations ucsi_ccg_ops = { .read_version = ucsi_ccg_read_version, .read_cci = ucsi_ccg_read_cci, + .poll_cci = ucsi_ccg_read_cci, .read_message_in = ucsi_ccg_read_message_in, .sync_control = ucsi_ccg_sync_control, .async_control = ucsi_ccg_async_control, diff --git a/drivers/usb/typec/ucsi/ucsi_glink.c b/drivers/usb/typec/ucsi/ucsi_glink.c index fed39d4580905..8af79101a2fc7 100644 --- a/drivers/usb/typec/ucsi/ucsi_glink.c +++ b/drivers/usb/typec/ucsi/ucsi_glink.c @@ -206,6 +206,7 @@ static void pmic_glink_ucsi_connector_status(struct ucsi_connector *con) static const struct ucsi_operations pmic_glink_ucsi_ops = { .read_version = pmic_glink_ucsi_read_version, .read_cci = pmic_glink_ucsi_read_cci, + .poll_cci = pmic_glink_ucsi_read_cci, .read_message_in = pmic_glink_ucsi_read_message_in, .sync_control = ucsi_sync_control_common, .async_control = pmic_glink_ucsi_async_control, diff --git a/drivers/usb/typec/ucsi/ucsi_stm32g0.c b/drivers/usb/typec/ucsi/ucsi_stm32g0.c index 6923fad31d795..57ef7d83a4121 100644 --- a/drivers/usb/typec/ucsi/ucsi_stm32g0.c +++ b/drivers/usb/typec/ucsi/ucsi_stm32g0.c @@ -424,6 +424,7 @@ static irqreturn_t ucsi_stm32g0_irq_handler(int irq, void *data) static const struct ucsi_operations ucsi_stm32g0_ops = { .read_version = ucsi_stm32g0_read_version, .read_cci = ucsi_stm32g0_read_cci, + .poll_cci = ucsi_stm32g0_read_cci, .read_message_in = ucsi_stm32g0_read_message_in, .sync_control = ucsi_sync_control_common, .async_control = ucsi_stm32g0_async_control, diff --git a/drivers/usb/typec/ucsi/ucsi_yoga_c630.c b/drivers/usb/typec/ucsi/ucsi_yoga_c630.c index 4cae85c0dc12a..d33e3f2dd1d80 100644 --- a/drivers/usb/typec/ucsi/ucsi_yoga_c630.c +++ b/drivers/usb/typec/ucsi/ucsi_yoga_c630.c @@ -74,6 +74,7 @@ static int yoga_c630_ucsi_async_control(struct ucsi *ucsi, u64 command) static const struct ucsi_operations yoga_c630_ucsi_ops = { .read_version = yoga_c630_ucsi_read_version, .read_cci = yoga_c630_ucsi_read_cci, + .poll_cci = yoga_c630_ucsi_read_cci, .read_message_in = yoga_c630_ucsi_read_message_in, .sync_control = ucsi_sync_control_common, .async_control = yoga_c630_ucsi_async_control, diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9ac25d08f473e..63612faeab727 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -666,7 +666,7 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, worker, name); - if (!vtsk) + if (IS_ERR(vtsk)) goto free_worker; mutex_init(&worker->mutex); diff --git a/drivers/virt/acrn/hsm.c b/drivers/virt/acrn/hsm.c index c24036c4e51ec..e4e196abdaac9 100644 --- a/drivers/virt/acrn/hsm.c +++ b/drivers/virt/acrn/hsm.c @@ -49,7 +49,7 @@ static int pmcmd_ioctl(u64 cmd, void __user *uptr) switch (cmd & PMCMD_TYPE_MASK) { case ACRN_PMCMD_GET_PX_CNT: case ACRN_PMCMD_GET_CX_CNT: - pm_info = kmalloc(sizeof(u64), GFP_KERNEL); + pm_info = kzalloc(sizeof(u64), GFP_KERNEL); if (!pm_info) return -ENOMEM; @@ -64,7 +64,7 @@ static int pmcmd_ioctl(u64 cmd, void __user *uptr) kfree(pm_info); break; case ACRN_PMCMD_GET_PX_DATA: - px_data = kmalloc(sizeof(*px_data), GFP_KERNEL); + px_data = kzalloc(sizeof(*px_data), GFP_KERNEL); if (!px_data) return -ENOMEM; @@ -79,7 +79,7 @@ static int pmcmd_ioctl(u64 cmd, void __user *uptr) kfree(px_data); break; case ACRN_PMCMD_GET_CX_DATA: - cx_data = kmalloc(sizeof(*cx_data), GFP_KERNEL); + cx_data = kzalloc(sizeof(*cx_data), GFP_KERNEL); if (!cx_data) return -ENOMEM; diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 264b6523fe52f..cf3fb61f4d5ba 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -38,12 +39,6 @@ struct snp_guest_dev { struct miscdevice misc; struct snp_msg_desc *msg_desc; - - union { - struct snp_report_req report; - struct snp_derived_key_req derived_key; - struct snp_ext_report_req ext_report; - } req; }; /* @@ -71,7 +66,7 @@ struct snp_req_resp { static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) { - struct snp_report_req *report_req = &snp_dev->req.report; + struct snp_report_req *report_req __free(kfree) = NULL; struct snp_msg_desc *mdesc = snp_dev->msg_desc; struct snp_report_resp *report_resp; struct snp_guest_req req = {}; @@ -80,6 +75,10 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io if (!arg->req_data || !arg->resp_data) return -EINVAL; + report_req = kzalloc(sizeof(*report_req), GFP_KERNEL_ACCOUNT); + if (!report_req) + return -ENOMEM; + if (copy_from_user(report_req, (void __user *)arg->req_data, sizeof(*report_req))) return -EFAULT; @@ -116,7 +115,7 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) { - struct snp_derived_key_req *derived_key_req = &snp_dev->req.derived_key; + struct snp_derived_key_req *derived_key_req __free(kfree) = NULL; struct snp_derived_key_resp derived_key_resp = {0}; struct snp_msg_desc *mdesc = snp_dev->msg_desc; struct snp_guest_req req = {}; @@ -136,6 +135,10 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque if (sizeof(buf) < resp_len) return -ENOMEM; + derived_key_req = kzalloc(sizeof(*derived_key_req), GFP_KERNEL_ACCOUNT); + if (!derived_key_req) + return -ENOMEM; + if (copy_from_user(derived_key_req, (void __user *)arg->req_data, sizeof(*derived_key_req))) return -EFAULT; @@ -168,16 +171,21 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques struct snp_req_resp *io) { - struct snp_ext_report_req *report_req = &snp_dev->req.ext_report; + struct snp_ext_report_req *report_req __free(kfree) = NULL; struct snp_msg_desc *mdesc = snp_dev->msg_desc; struct snp_report_resp *report_resp; struct snp_guest_req req = {}; int ret, npages = 0, resp_len; sockptr_t certs_address; + struct page *page; if (sockptr_is_null(io->req_data) || sockptr_is_null(io->resp_data)) return -EINVAL; + report_req = kzalloc(sizeof(*report_req), GFP_KERNEL_ACCOUNT); + if (!report_req) + return -ENOMEM; + if (copy_from_sockptr(report_req, io->req_data, sizeof(*report_req))) return -EFAULT; @@ -203,8 +211,20 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques * the host. If host does not supply any certs in it, then copy * zeros to indicate that certificate data was not provided. */ - memset(mdesc->certs_data, 0, report_req->certs_len); npages = report_req->certs_len >> PAGE_SHIFT; + page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + get_order(report_req->certs_len)); + if (!page) + return -ENOMEM; + + req.certs_data = page_address(page); + ret = set_memory_decrypted((unsigned long)req.certs_data, npages); + if (ret) { + pr_err("failed to mark page shared, ret=%d\n", ret); + __free_pages(page, get_order(report_req->certs_len)); + return -EFAULT; + } + cmd: /* * The intermediate response buffer is used while decrypting the @@ -213,10 +233,12 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques */ resp_len = sizeof(report_resp->data) + mdesc->ctx->authsize; report_resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); - if (!report_resp) - return -ENOMEM; + if (!report_resp) { + ret = -ENOMEM; + goto e_free_data; + } - mdesc->input.data_npages = npages; + req.input.data_npages = npages; req.msg_version = arg->msg_version; req.msg_type = SNP_MSG_REPORT_REQ; @@ -231,7 +253,7 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques /* If certs length is invalid then copy the returned length */ if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) { - report_req->certs_len = mdesc->input.data_npages << PAGE_SHIFT; + report_req->certs_len = req.input.data_npages << PAGE_SHIFT; if (copy_to_sockptr(io->req_data, report_req, sizeof(*report_req))) ret = -EFAULT; @@ -240,7 +262,7 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques if (ret) goto e_free; - if (npages && copy_to_sockptr(certs_address, mdesc->certs_data, report_req->certs_len)) { + if (npages && copy_to_sockptr(certs_address, req.certs_data, report_req->certs_len)) { ret = -EFAULT; goto e_free; } @@ -250,6 +272,13 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques e_free: kfree(report_resp); +e_free_data: + if (npages) { + if (set_memory_encrypted((unsigned long)req.certs_data, npages)) + WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); + else + __free_pages(page, get_order(report_req->certs_len)); + } return ret; } diff --git a/drivers/virt/vboxguest/Kconfig b/drivers/virt/vboxguest/Kconfig index 11b153e7454e4..eaba28c95e733 100644 --- a/drivers/virt/vboxguest/Kconfig +++ b/drivers/virt/vboxguest/Kconfig @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only config VBOXGUEST tristate "Virtual Box Guest integration support" - depends on (ARM64 || X86) && PCI && INPUT + depends on (ARM64 || X86 || COMPILE_TEST) && PCI && INPUT + depends on HAS_IOPORT help This is a driver for the Virtual Box Guest PCI device used in Virtual Box virtual machines. Enabling this driver will add diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 8a294b9cbcf68..56d0dbe621637 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2950,8 +2950,8 @@ static int virtio_mem_probe(struct virtio_device *vdev) mutex_init(&vm->hotplug_mutex); INIT_LIST_HEAD(&vm->next); spin_lock_init(&vm->removal_lock); - hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vm->retry_timer.function = virtio_mem_timer_expired; + hrtimer_setup(&vm->retry_timer, virtio_mem_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; vm->in_kdump = is_kdump_kernel(); diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c index 7a1096265f180..0820e35ad2e3c 100644 --- a/drivers/watchdog/softdog.c +++ b/drivers/watchdog/softdog.c @@ -187,14 +187,12 @@ static int __init softdog_init(void) watchdog_set_nowayout(&softdog_dev, nowayout); watchdog_stop_on_reboot(&softdog_dev); - hrtimer_init(&softdog_ticktock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - softdog_ticktock.function = softdog_fire; + hrtimer_setup(&softdog_ticktock, softdog_fire, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (IS_ENABLED(CONFIG_SOFT_WATCHDOG_PRETIMEOUT)) { softdog_info.options |= WDIOF_PRETIMEOUT; - hrtimer_init(&softdog_preticktock, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - softdog_preticktock.function = softdog_pretimeout; + hrtimer_setup(&softdog_preticktock, softdog_pretimeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } if (soft_active_on_boot) diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 19698d87dc572..8369fd94fc1a6 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1051,8 +1051,8 @@ static int watchdog_cdev_register(struct watchdog_device *wdd) } kthread_init_work(&wd_data->work, watchdog_ping_work); - hrtimer_init(&wd_data->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - wd_data->timer.function = watchdog_timer_expired; + hrtimer_setup(&wd_data->timer, watchdog_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); watchdog_hrtimer_pretimeout_init(wdd); if (wdd->id == 0) { diff --git a/drivers/watchdog/watchdog_hrtimer_pretimeout.c b/drivers/watchdog/watchdog_hrtimer_pretimeout.c index 940b53718a91e..fbc7eecd8b203 100644 --- a/drivers/watchdog/watchdog_hrtimer_pretimeout.c +++ b/drivers/watchdog/watchdog_hrtimer_pretimeout.c @@ -23,8 +23,8 @@ void watchdog_hrtimer_pretimeout_init(struct watchdog_device *wdd) { struct watchdog_core_data *wd_data = wdd->wd_data; - hrtimer_init(&wd_data->pretimeout_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wd_data->pretimeout_timer.function = watchdog_hrtimer_pretimeout; + hrtimer_setup(&wd_data->pretimeout_timer, watchdog_hrtimer_pretimeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } void watchdog_hrtimer_pretimeout_start(struct watchdog_device *wdd) diff --git a/fs/affs/file.c b/fs/affs/file.c index a5a861dd52230..7a71018e3f675 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -596,7 +596,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) BUG_ON(tmp > bsize); AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); - AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1); AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); affs_fix_checksum(sb, bh); bh->b_state &= ~(1UL << BH_New); @@ -724,7 +724,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, tmp = min(bsize - boff, to - from); BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); - be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32( + max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size))); affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); written += tmp; @@ -746,7 +747,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); - AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1); AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize); AFFS_DATA_HEAD(bh)->next = 0; bh->b_state &= ~(1UL << BH_New); @@ -780,7 +781,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); - AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1); AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); AFFS_DATA_HEAD(bh)->next = 0; bh->b_state &= ~(1UL << BH_New); diff --git a/fs/afs/server.c b/fs/afs/server.c index 038f9d0ae3af8..4504e16b458cc 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -163,6 +163,8 @@ static struct afs_server *afs_install_server(struct afs_cell *cell, rb_insert_color(&server->uuid_rb, &net->fs_servers); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + afs_get_cell(cell, afs_cell_trace_get_server); + added_dup: write_seqlock(&net->fs_addr_lock); estate = rcu_dereference_protected(server->endpoint_state, @@ -442,6 +444,7 @@ static void afs_server_rcu(struct rcu_head *rcu) atomic_read(&server->active), afs_server_trace_free); afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); + afs_put_cell(server->cell, afs_cell_trace_put_server); kfree(server); } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 7e7e567a7f8a2..d20cd902ef949 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_put_server(volume->cell->net, server, - afs_server_trace_put_slist_isort); + afs_unuse_server(volume->cell->net, server, + afs_server_trace_put_slist_isort); continue; } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ca755e8d1a372..1ec1f90e0eb38 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } - bch2_btree_lock_init(&b->c, 0); + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); __bch2_btree_node_to_freelist(bc, b); return b; @@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea } b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (!b) { + if (b) { + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); + } else { mutex_unlock(&bc->lock); bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); mutex_lock(&bc->lock); } - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e371e60e3133e..dece27d9db04e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -996,7 +996,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, } got_good_key: le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); } fsck_err: diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1821f40c161a1..edce594333756 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k } if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); ck->c.cached = true; goto lock; } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 10b805a60f526..caef65adeae49 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -7,9 +7,10 @@ static struct lock_class_key bch2_btree_node_lock_key; void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags) + enum six_lock_init_flags flags, + gfp_t gfp) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); lockdep_set_notrack_class(&b->lock); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b54ef48eb8cc2..b33ab7af84402 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,7 +13,7 @@ #include "btree_iter.h" #include "six.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 337494facac64..642fbc60ecab1 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -340,6 +340,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct printbuf buf = PRINTBUF; prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); prt_str(&buf, "\nk: "); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index a633f83c1ac78..362b3b2f2f2e3 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -31,11 +31,6 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -static inline unsigned int dirent_occupied_size(const struct qstr *name) -{ - return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64); -} - int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 620b284aa34f0..204d765dd74c8 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, ptr1.unwritten == ptr2.unwritten && ptr1.offset == ptr2.offset && ptr1.dev == ptr2.dev && - ptr1.dev == ptr2.dev); + ptr1.gen == ptr2.gen); } void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index d70d9f634cea9..2c3d46ac70c61 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -152,7 +152,6 @@ int bch2_create_trans(struct btree_trans *trans, if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_u->bi_size += dirent_occupied_size(name); ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) @@ -221,7 +220,6 @@ int bch2_link_trans(struct btree_trans *trans, } dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_u->bi_size += dirent_occupied_size(name); dir_hash = bch2_hash_info_init(c, dir_u); @@ -324,7 +322,6 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - dir_u->bi_size -= dirent_occupied_size(name); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, @@ -463,14 +460,6 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (mode == BCH_RENAME) { - src_dir_u->bi_size -= dirent_occupied_size(src_name); - dst_dir_u->bi_size += dirent_occupied_size(dst_name); - } - - if (mode == BCH_RENAME_OVERWRITE) - src_dir_u->bi_size -= dirent_occupied_size(src_name); - if (src_inode_u->bi_parent_subvol) src_inode_u->bi_parent_subvol = dst_dir.subvol; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 94bf34b9b65f0..717e7b94c66f8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; + ret = 0; truncate_setsize(&inode->v, iattr->ia_size); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9bf316e7b845d..0e85131d0af88 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1978,31 +1978,10 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return ret; } -static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - darray_for_each(w->inodes, i) - if (fsck_err_on(i->inode.bi_size != i->i_size, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_size: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { - i->inode.bi_size = i->i_size; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: - check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 24c294d4634e0..05b1250619ecc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1021,8 +1021,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, /* allocate journal on a device: */ -static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) +static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; @@ -1150,26 +1150,20 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, return ret; } -/* - * Allocate more journal space at runtime - not currently making use if it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) +static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, + unsigned nr, bool new_fs) { struct journal_device *ja = &ca->journal; - struct closure cl; int ret = 0; + struct closure cl; closure_init_stack(&cl); - down_write(&c->state_lock); - /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) - goto unlock; + return 0; - while (ja->nr < nr) { + while (!ret && ja->nr < nr) { struct disk_reservation disk_res = { 0, 0, 0 }; /* @@ -1182,25 +1176,38 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * filesystem-wide allocation will succeed, this is a device * specific allocation - we can hang here: */ + if (!new_fs) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) + break; + } - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; + ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + if (ret == -BCH_ERR_bucket_alloc_blocked || + ret == -BCH_ERR_open_buckets_empty) + ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); - - if (ret && ret != -BCH_ERR_bucket_alloc_blocked) - break; } - bch_err_fn(c, ret); -unlock: + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use if it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + down_write(&c->state_lock); + int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); up_write(&c->state_lock); + + bch_err_fn(c, ret); return ret; } @@ -1226,7 +1233,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); + ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 21805509ab9e6..6718dc37c5a35 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -74,20 +74,14 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, struct move_bucket *b, u64 time) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a; - int ret; - if (bch2_bucket_is_open(trans->c, - b->k.bucket.inode, - b->k.bucket.offset)) + if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) return 0; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_cached); + int ret = bkey_err(k); if (ret) return ret; @@ -95,13 +89,18 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (!ca) goto out; - a = bch2_alloc_to_v4(k, &_a); + if (ca->mi.state != BCH_MEMBER_STATE_rw || + !bch2_dev_is_online(ca)) + goto out_put; + + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; b->sectors = bch2_bucket_sectors_dirty(*a); u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); ret = lru_idx && lru_idx <= time; - +out_put: bch2_dev_put(ca); out: bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 35e07bc8fbd34..051214fdc7352 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -90,10 +90,7 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_directory_size_mismatch) \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7e7c66a1e1a6b..7c403427fbdb8 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock) EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags) + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp) { atomic_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); @@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name, * failure if they wish by checking lock->readers, but generally * will not want to treat it as an error. */ - lock->readers = alloc_percpu(unsigned); + lock->readers = alloc_percpu_gfp(unsigned, gfp); } #endif } diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index c142e06b7a3a7..59b851cf8bacc 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -164,18 +164,19 @@ enum six_lock_init_flags { }; void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags); + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp); /** * six_lock_init - initialize a six lock * @lock: lock to initialize * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU */ -#define six_lock_init(lock, flags) \ +#define six_lock_init(lock, flags, gfp) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key, flags); \ + __six_lock_init((lock), #lock, &__key, flags, gfp); \ } while (0) /** diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8037ccbacf6af..a81a7b6c09897 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -69,14 +69,20 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta return v; } -void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed; + + if (ret) { + mutex_lock(&c->sb_lock); + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + return ret; } const char * const bch2_sb_fields[] = { @@ -1219,9 +1225,11 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) c->disk_sb.sb->version = cpu_to_le16(new_version); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); - if (incompat) + if (incompat) { SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); + } } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index f1ab4f9437203..b4cff9ebdebbf 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); +bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); static inline bool bch2_request_incompat_feature(struct bch_fs *c, enum bcachefs_metadata_version version) { - if (unlikely(version > c->sb.version_incompat)) { - if (version > c->sb.version_incompat_allowed) - return false; - bch2_set_version_incompat(c, version); - } - return true; + return likely(version <= c->sb.version_incompat) + ? true + : bch2_set_version_incompat(c, version); } static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae25..7f46abbd6311b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,12 +1187,61 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. + */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ed3c0d6546c5d..0b568c8d24cbc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) u64 lockend; size_t num_written = 0; ssize_t ret; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); @@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9322601ab5c9..38756f8cef463 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1382,8 +1382,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 53b846d99ecea..14f53f7575553 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1330,13 +1330,13 @@ MODULE_PARM_DESC(read_policy, int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { - char param[32] = { 0 }; + char param[32]; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; - strncpy(param, str, sizeof(param) - 1); + strscpy(param, str); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 6558508c2ddf5..265370e79a546 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) if (!ret) ret = select_delayed_refs_test(&trans); + kfree(transaction); out_free_fs_info: btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0a07764890550..3f8afbd1ebb55 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7155,6 +7155,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7200,8 +7201,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) diff --git a/fs/coredump.c b/fs/coredump.c index 591700e1b2ce6..4375c70144d0a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -63,6 +63,7 @@ static void free_vma_snapshot(struct coredump_params *cprm); static int core_uses_pid; static unsigned int core_pipe_limit; +static unsigned int core_sort_vma; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; @@ -1026,6 +1027,15 @@ static const struct ctl_table coredump_sysctls[] = { .extra1 = (unsigned int *)&core_file_note_size_min, .extra2 = (unsigned int *)&core_file_note_size_max, }, + { + .procname = "core_sort_vma", + .data = &core_sort_vma, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_coredump_sysctls(void) @@ -1256,8 +1266,9 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) cprm->vma_data_size += m->dump_size; } - sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), - cmp_vma_size, NULL); + if (core_sort_vma) + sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), + cmp_vma_size, NULL); return true; } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index cb1b6d0c34545..c294a8fc566da 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -57,10 +57,11 @@ static ssize_t efivarfs_file_write(struct file *file, if (bytes == -ENOENT) { /* - * zero size signals to release that the write deleted - * the variable + * FIXME: temporary workaround for fwupdate, signal + * failed write with a 1 to keep created but not + * written files */ - i_size_write(inode, 0); + i_size_write(inode, 1); } else { i_size_write(inode, datasize + sizeof(attributes)); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); @@ -124,7 +125,8 @@ static int efivarfs_file_release(struct inode *inode, struct file *file) struct efivar_entry *var = inode->i_private; inode_lock(inode); - var->removed = (--var->open_count == 0 && i_size_read(inode) == 0); + /* FIXME: temporary work around for fwupdate */ + var->removed = (--var->open_count == 0 && i_size_read(inode) == 1); inode_unlock(inode); if (var->removed) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 09fcf731e65d6..6eae8cf655c12 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -367,6 +367,8 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + register_pm_notifier(&sfi->pm_nb); + return efivar_init(efivarfs_callback, sb, true); } @@ -552,7 +554,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc) sfi->pm_nb.notifier_call = efivarfs_pm_notify; sfi->pm_nb.priority = 0; - register_pm_notifier(&sfi->pm_nb); return 0; } diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index ce9be95c9172f..9ff825f1502d5 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -141,7 +141,7 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) return 0; } -void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) +int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; @@ -150,13 +150,17 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) struct exfat_mount_options *opts = &sbi->options; if (!is_valid_cluster(sbi, clu)) - return; + return -EIO; ent_idx = CLUSTER_TO_BITMAP_ENT(clu); i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + if (!test_bit_le(b, sbi->vol_amap[i]->b_data)) + return -EIO; + clear_bit_le(b, sbi->vol_amap[i]->b_data); + exfat_update_bh(sbi->vol_amap[i], sync); if (opts->discard) { @@ -171,6 +175,8 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) opts->discard = 0; } } + + return 0; } /* diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 78be6964a8a08..d30ce18a88b7a 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -456,7 +456,7 @@ int exfat_count_num_clusters(struct super_block *sb, int exfat_load_bitmap(struct super_block *sb); void exfat_free_bitmap(struct exfat_sb_info *sbi); int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync); -void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); +int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 9e5492ac409b0..6f3651c6ca91e 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -175,6 +175,7 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu)); if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + int err; unsigned int last_cluster = p_chain->dir + p_chain->size - 1; do { bool sync = false; @@ -189,7 +190,9 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain cur_cmap_i = next_cmap_i; } - exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + err = exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + if (err) + break; clu++; num_clusters++; } while (num_clusters < p_chain->size); @@ -210,12 +213,13 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain cur_cmap_i = next_cmap_i; } - exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)))) + break; clu = n_clu; num_clusters++; if (err) - goto dec_used_clus; + break; if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { /* @@ -229,7 +233,6 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain } while (clu != EXFAT_EOF_CLUSTER); } -dec_used_clus: sbi->used_clusters -= num_clusters; return 0; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 05b51e7217838..807349d8ea050 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -587,7 +587,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) valid_size = ei->valid_size; ret = generic_write_checks(iocb, iter); - if (ret < 0) + if (ret <= 0) goto unlock; if (iocb->ki_flags & IOCB_DIRECT) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 691dd77b6ab5f..8b30027d82512 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -232,7 +232,7 @@ static int exfat_search_empty_slot(struct super_block *sb, dentry = 0; } - while (dentry + num_entries < total_entries && + while (dentry + num_entries <= total_entries && clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 1); @@ -646,6 +646,11 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); info->size = le64_to_cpu(ep2->dentry.stream.size); + if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) { + exfat_fs_error(sb, "data size is invalid(%lld)", info->size); + return -EIO; + } + info->start_clu = le32_to_cpu(ep2->dentry.stream.start_clu); if (!is_valid_cluster(sbi, info->start_clu) && info->size) { exfat_warn(sb, "start_clu is invalid cluster(0x%x)", diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5b5f789b37eb6..2c3a4d09e500f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -838,6 +838,12 @@ static int fuse_check_folio(struct folio *folio) return 0; } +/* + * Attempt to steal a page from the splice() pipe and move it into the + * pagecache. If successful, the pointer in @pagep will be updated. The + * folio that was originally in @pagep will lose a reference and the new + * folio returned in @pagep will carry a reference. + */ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; @@ -1451,7 +1457,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { + if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } @@ -2101,7 +2107,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - unsigned int head, tail, mask, count; + unsigned int head, tail, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; @@ -2118,8 +2124,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; - count = head - tail; + count = pipe_occupancy(head, tail); bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { @@ -2129,8 +2134,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = tail; idx != head && rem < len; idx++) - rem += pipe->bufs[idx & mask].len; + for (idx = tail; !pipe_empty(head, idx) && rem < len; idx++) + rem += pipe_buf(pipe, idx)->len; ret = -EINVAL; if (rem < len) @@ -2141,10 +2146,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - if (WARN_ON(nbuf >= count || tail == head)) + if (WARN_ON(nbuf >= count || pipe_empty(head, tail))) goto out_free; - ibuf = &pipe->bufs[tail & mask]; + ibuf = pipe_buf(pipe, tail); obuf = &bufs[nbuf]; if (rem >= ibuf->len) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 198862b086ff7..3805f9b06c9d2 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1636,7 +1636,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d92a54799985..d63e56fd3dd20 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -955,8 +955,10 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_folios; i++) + for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); + folio_put(ap->folios[i]); + } if (ia->ff) fuse_file_put(ia->ff, false); @@ -1048,7 +1050,14 @@ static void fuse_readahead(struct readahead_control *rac) ap = &ia->ap; while (ap->num_folios < cur_pages) { - folio = readahead_folio(rac); + /* + * This returns a folio with a ref held on it. + * The ref needs to be held until the request is + * completed, since the splice case (see + * fuse_try_move_page()) drops the ref after it's + * replaced in the page cache. + */ + folio = __readahead_folio(rac); ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e8..0e47da82b0c24 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -427,12 +427,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, bio_put(bio); goto zero_tail; } - if (dio->flags & IOMAP_DIO_WRITE) { + if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(n); - } else { - if (dio->flags & IOMAP_DIO_DIRTY) - bio_set_pages_dirty(bio); - } + else if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); dio->size += n; copied += n; diff --git a/fs/namei.c b/fs/namei.c index 3ab9440c5b931..ecb7b95c2ca33 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5356,10 +5356,9 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ -const char *page_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; @@ -5378,8 +5377,23 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); - kaddr = page_address(page); - nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return page_address(page); +} + +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + return __page_get_link(dentry, inode, callback); +} +EXPORT_SYMBOL_GPL(page_get_link_raw); + +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + char *kaddr = __page_get_link(dentry, inode, callback); + + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 035ba52742a50..4db912f562305 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -780,6 +780,43 @@ int nfs4_inode_return_delegation(struct inode *inode) return 0; } +/** + * nfs4_inode_set_return_delegation_on_close - asynchronously return a delegation + * @inode: inode to process + * + * This routine is called to request that the delegation be returned as soon + * as the file is closed. If the file is already closed, the delegation is + * immediately returned. + */ +void nfs4_inode_set_return_delegation_on_close(struct inode *inode) +{ + struct nfs_delegation *delegation; + struct nfs_delegation *ret = NULL; + + if (!inode) + return; + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (!delegation) + goto out; + spin_lock(&delegation->lock); + if (!delegation->inode) + goto out_unlock; + if (list_empty(&NFS_I(inode)->open_files) && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out_unlock: + spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); +out: + rcu_read_unlock(); + nfs_end_delegation_return(inode, ret, 0); +} + /** * nfs4_inode_return_delegation_on_close - asynchronously return a delegation * @inode: inode to process diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 71524d34ed207..8ff5ab9c5c256 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -49,6 +49,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, unsigned long pagemod_limit, u32 deleg_type); int nfs4_inode_return_delegation(struct inode *inode); void nfs4_inode_return_delegation_on_close(struct inode *inode); +void nfs4_inode_set_return_delegation_on_close(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_evict_delegation(struct inode *inode); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f45beea92d034..f32f8d7c9122b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -56,6 +56,7 @@ #include #include +#include "delegation.h" #include "internal.h" #include "iostat.h" #include "pnfs.h" @@ -130,6 +131,20 @@ static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, dreq->count = req_start; } +static void nfs_direct_file_adjust_size_locked(struct inode *inode, + loff_t offset, size_t count) +{ + loff_t newsize = offset + (loff_t)count; + loff_t oldsize = i_size_read(inode); + + if (newsize > oldsize) { + i_size_write(inode, newsize); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; + trace_nfs_size_grow(inode, newsize); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + } +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -272,6 +287,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); + nfs_update_delegated_atime(dreq->inode); + while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; @@ -741,6 +758,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct inode *inode = dreq->inode; int flags = NFS_ODIRECT_DONE; trace_nfs_direct_write_completion(dreq); @@ -762,6 +780,11 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } spin_unlock(&dreq->lock); + spin_lock(&inode->i_lock); + nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count); + nfs_update_delegated_mtime_locked(dreq->inode); + spin_unlock(&inode->i_lock); + while (!list_empty(&hdr->pages)) { req = nfs_list_entry(hdr->pages.next); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1bb646752e466..033feeab8c346 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -457,7 +458,7 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) /* If the private flag is set, then the folio is not freeable */ if (folio_test_private(folio)) { if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || - current_is_kswapd()) + current_is_kswapd() || current_is_kcompactd()) return false; if (nfs_wb_folio(folio->mapping->host, folio) < 0) return false; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index df9669d4ded7f..6e95db6c17e92 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -133,6 +133,7 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (err) return NULL; + label->lsmid = shim.id; label->label = shim.context; label->len = shim.len; return label; @@ -145,7 +146,7 @@ nfs4_label_release_security(struct nfs4_label *label) if (label) { shim.context = label->label; shim.len = label->len; - shim.id = LSM_ID_UNDEF; + shim.id = label->lsmid; security_release_secctx(&shim); } } @@ -3906,8 +3907,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { + struct dentry *dentry = ctx->dentry; if (ctx->state == NULL) return; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + nfs4_inode_set_return_delegation_on_close(d_inode(dentry)); if (is_sync) nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx)); else @@ -6269,7 +6273,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_label label = {0, 0, buflen, buf}; + struct nfs4_label label = {0, 0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; struct nfs_fattr fattr = { @@ -6374,7 +6378,7 @@ static int nfs4_do_set_security_label(struct inode *inode, static int nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { - struct nfs4_label ilabel = {0, 0, buflen, (char *)buf }; + struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf }; struct nfs_fattr *fattr; int status; diff --git a/fs/nsfs.c b/fs/nsfs.c index 663f8656158d5..f7fddf8ecf73f 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -37,7 +37,6 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) } const struct dentry_operations ns_dentry_operations = { - .d_delete = always_delete_dentry, .d_dname = ns_dname, .d_prune = stashed_dentry_prune, }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0c28e5fa34077..d7310fcf38881 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -618,7 +618,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); - dput(upper); if (!err) { /* Restore timestamps on parent (best effort) */ @@ -626,6 +625,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } + dput(upper); } inode_unlock(udir); if (err) diff --git a/fs/pidfs.c b/fs/pidfs.c index 63f9699ebac36..c0478b3c55d9f 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -521,7 +521,6 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) } const struct dentry_operations pidfs_dentry_operations = { - .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; diff --git a/fs/pipe.c b/fs/pipe.c index ce1af7592780d..4d0799e4e7196 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -210,11 +210,10 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) }; unsigned int writers = READ_ONCE(pipe->writers); - return !pipe_empty(head, tail) || !writers; + return !pipe_empty(idx.head, idx.tail) || !writers; } static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, @@ -395,7 +394,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) wake_next_reader = true; mutex_lock(&pipe->mutex); } - if (pipe_empty(pipe->head, pipe->tail)) + if (pipe_is_empty(pipe)) wake_next_reader = false; mutex_unlock(&pipe->mutex); @@ -417,11 +416,10 @@ static inline int is_packetized(struct file *file) /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_writable(const struct pipe_inode_info *pipe) { - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) }; unsigned int max_usage = READ_ONCE(pipe->max_usage); - return !pipe_full(head, tail, max_usage) || + return !pipe_full(idx.head, idx.tail, max_usage) || !READ_ONCE(pipe->readers); } @@ -579,11 +577,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); mutex_lock(&pipe->mutex); - was_empty = pipe_empty(pipe->head, pipe->tail); + was_empty = pipe_is_empty(pipe); wake_next_writer = true; } out: - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) wake_next_writer = false; mutex_unlock(&pipe->mutex); @@ -616,7 +614,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - unsigned int count, head, tail, mask; + unsigned int count, head, tail; switch (cmd) { case FIONREAD: @@ -624,10 +622,9 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count = 0; head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; - while (tail != head) { - count += pipe->bufs[tail & mask].len; + while (!pipe_empty(head, tail)) { + count += pipe_buf(pipe, tail)->len; tail++; } mutex_unlock(&pipe->mutex); @@ -659,7 +656,7 @@ pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - unsigned int head, tail; + union pipe_index idx; /* Epoll has some historical nasty semantics, this enables them */ WRITE_ONCE(pipe->poll_usage, true); @@ -680,19 +677,18 @@ pipe_poll(struct file *filp, poll_table *wait) * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. */ - head = READ_ONCE(pipe->head); - tail = READ_ONCE(pipe->tail); + idx.head_tail = READ_ONCE(pipe->head_tail); mask = 0; if (filp->f_mode & FMODE_READ) { - if (!pipe_empty(head, tail)) + if (!pipe_empty(idx.head, idx.tail)) mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_pipe != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { - if (!pipe_full(head, tail, pipe->max_usage)) + if (!pipe_full(idx.head, idx.tail, pipe->max_usage)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 699a3f76d0834..64bd68f750f84 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -763,7 +763,7 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, struct cifs_fattr *fattr, bool mode_from_special_sid) { int i; - int num_aces = 0; + u16 num_aces = 0; int acl_size; char *acl_base; struct smb_ace **ppace; @@ -778,14 +778,15 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, } /* validate that we do not go past end of acl */ - if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) || + end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { cifs_dbg(VFS, "ACL too small to parse DACL\n"); return; } cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n", le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), - le32_to_cpu(pdacl->num_aces)); + le16_to_cpu(pdacl->num_aces)); /* reset rwx permissions for user/group/other. Also, if num_aces is 0 i.e. DACL has no ACEs, @@ -795,12 +796,15 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, acl_base = (char *)pdacl; acl_size = sizeof(struct smb_acl); - num_aces = le32_to_cpu(pdacl->num_aces); + num_aces = le16_to_cpu(pdacl->num_aces); if (num_aces > 0) { umode_t denied_mode = 0; - if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) / + (offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth) + sizeof(__le16))) return; + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); if (!ppace) @@ -937,12 +941,12 @@ unsigned int setup_special_user_owner_ACE(struct smb_ace *pntace) static void populate_new_aces(char *nacl_base, struct smb_sid *pownersid, struct smb_sid *pgrpsid, - __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, + __u64 *pnmode, u16 *pnum_aces, u16 *pnsize, bool modefromsid, bool posix) { __u64 nmode; - u32 num_aces = 0; + u16 num_aces = 0; u16 nsize = 0; __u64 user_mode; __u64 group_mode; @@ -1050,7 +1054,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p u16 size = 0; struct smb_ace *pntace = NULL; char *acl_base = NULL; - u32 src_num_aces = 0; + u16 src_num_aces = 0; u16 nsize = 0; struct smb_ace *pnntace = NULL; char *nacl_base = NULL; @@ -1058,7 +1062,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p acl_base = (char *)pdacl; size = sizeof(struct smb_acl); - src_num_aces = le32_to_cpu(pdacl->num_aces); + src_num_aces = le16_to_cpu(pdacl->num_aces); nacl_base = (char *)pndacl; nsize = sizeof(struct smb_acl); @@ -1090,11 +1094,11 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, u16 size = 0; struct smb_ace *pntace = NULL; char *acl_base = NULL; - u32 src_num_aces = 0; + u16 src_num_aces = 0; u16 nsize = 0; struct smb_ace *pnntace = NULL; char *nacl_base = NULL; - u32 num_aces = 0; + u16 num_aces = 0; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ @@ -1112,7 +1116,7 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, acl_base = (char *)pdacl; size = sizeof(struct smb_acl); - src_num_aces = le32_to_cpu(pdacl->num_aces); + src_num_aces = le16_to_cpu(pdacl->num_aces); /* Retain old ACEs which we can retain */ for (i = 0; i < src_num_aces; ++i) { @@ -1158,7 +1162,7 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, } finalize_dacl: - pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->num_aces = cpu_to_le16(num_aces); pndacl->size = cpu_to_le16(nsize); return 0; @@ -1293,7 +1297,7 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); ndacl_ptr->size = cpu_to_le16(0); - ndacl_ptr->num_aces = cpu_to_le32(0); + ndacl_ptr->num_aces = cpu_to_le16(0); rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, pnmode, mode_from_sid, posix); @@ -1653,7 +1657,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); if (mode_from_sid) nsecdesclen += - le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace); + le16_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace); else /* cifsacl */ nsecdesclen += le16_to_cpu(dacl_ptr->size); } diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 3feaa0f681699..d07682020c645 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1338,7 +1338,8 @@ cifs_readv_callback(struct mid_q_entry *mid) rdata->credits.value = 0; rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; - queue_work(cifsiod_wq, &rdata->subreq.work); + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(&rdata->subreq); release_mid(mid); add_credits(server, &credits, 0); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 23e0c8be7fb52..4dd11eafb69d9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4965,6 +4965,10 @@ receive_encrypted_standard(struct TCP_Server_Info *server, next_buffer = (char *)cifs_buf_get(); else next_buffer = (char *)cifs_small_buf_get(); + if (!next_buffer) { + cifs_server_dbg(VFS, "No memory for (large) SMB response\n"); + return -1; + } memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd); } diff --git a/fs/smb/common/smbacl.h b/fs/smb/common/smbacl.h index 6a60698fc6f0f..a624ec9e4a144 100644 --- a/fs/smb/common/smbacl.h +++ b/fs/smb/common/smbacl.h @@ -107,7 +107,8 @@ struct smb_sid { struct smb_acl { __le16 revision; /* revision level */ __le16 size; - __le32 num_aces; + __le16 num_aces; + __le16 reserved; } __attribute__((packed)); struct smb_ace { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index f1efcd0274750..c53121538990e 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -7458,17 +7458,17 @@ int smb2_lock(struct ksmbd_work *work) } no_check_cl: + flock = smb_lock->fl; + list_del(&smb_lock->llist); + if (smb_lock->zero_len) { err = 0; goto skip; } - - flock = smb_lock->fl; - list_del(&smb_lock->llist); retry: rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL); skip: - if (flags & SMB2_LOCKFLAG_UNLOCK) { + if (smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) { if (!rc) { ksmbd_debug(SMB, "File unlocked\n"); } else if (rc == -ENOENT) { diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index d39d3e553366d..49b128698670a 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -333,7 +333,7 @@ void posix_state_to_acl(struct posix_acl_state *state, pace->e_perm = state->other.allow; } -int init_acl_state(struct posix_acl_state *state, int cnt) +int init_acl_state(struct posix_acl_state *state, u16 cnt) { int alloc; @@ -368,7 +368,7 @@ static void parse_dacl(struct mnt_idmap *idmap, struct smb_fattr *fattr) { int i, ret; - int num_aces = 0; + u16 num_aces = 0; unsigned int acl_size; char *acl_base; struct smb_ace **ppace; @@ -389,16 +389,18 @@ static void parse_dacl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "DACL revision %d size %d num aces %d\n", le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), - le32_to_cpu(pdacl->num_aces)); + le16_to_cpu(pdacl->num_aces)); acl_base = (char *)pdacl; acl_size = sizeof(struct smb_acl); - num_aces = le32_to_cpu(pdacl->num_aces); + num_aces = le16_to_cpu(pdacl->num_aces); if (num_aces <= 0) return; - if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) / + (offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth) + sizeof(__le16))) return; ret = init_acl_state(&acl_state, num_aces); @@ -432,6 +434,7 @@ static void parse_dacl(struct mnt_idmap *idmap, offsetof(struct smb_sid, sub_auth); if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || (end_of_acl - acl_base < acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || @@ -580,7 +583,7 @@ static void parse_dacl(struct mnt_idmap *idmap, static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, struct smb_ace *pndace, - struct smb_fattr *fattr, u32 *num_aces, + struct smb_fattr *fattr, u16 *num_aces, u16 *size, u32 nt_aces_num) { struct posix_acl_entry *pace; @@ -701,7 +704,7 @@ static void set_ntacl_dacl(struct mnt_idmap *idmap, struct smb_fattr *fattr) { struct smb_ace *ntace, *pndace; - int nt_num_aces = le32_to_cpu(nt_dacl->num_aces), num_aces = 0; + u16 nt_num_aces = le16_to_cpu(nt_dacl->num_aces), num_aces = 0; unsigned short size = 0; int i; @@ -728,7 +731,7 @@ static void set_ntacl_dacl(struct mnt_idmap *idmap, set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, nt_num_aces); - pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->num_aces = cpu_to_le16(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } @@ -736,7 +739,7 @@ static void set_mode_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_fattr *fattr) { struct smb_ace *pace, *pndace; - u32 num_aces = 0; + u16 num_aces = 0; u16 size = 0, ace_size = 0; uid_t uid; const struct smb_sid *sid; @@ -792,7 +795,7 @@ static void set_mode_dacl(struct mnt_idmap *idmap, fattr->cf_mode, 0007); out: - pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->num_aces = cpu_to_le16(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } @@ -807,6 +810,13 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) return -EINVAL; } + if (!psid->num_subauth) + return 0; + + if (psid->num_subauth > SID_MAX_SUB_AUTHORITIES || + end_of_acl < (char *)psid + 8 + sizeof(__le32) * psid->num_subauth) + return -EINVAL; + return 0; } @@ -848,6 +858,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, pntsd->type = cpu_to_le16(DACL_PRESENT); if (pntsd->osidoffset) { + if (le32_to_cpu(pntsd->osidoffset) < sizeof(struct smb_ntsd)) + return -EINVAL; + rc = parse_sid(owner_sid_ptr, end_of_acl); if (rc) { pr_err("%s: Error %d parsing Owner SID\n", __func__, rc); @@ -863,6 +876,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, } if (pntsd->gsidoffset) { + if (le32_to_cpu(pntsd->gsidoffset) < sizeof(struct smb_ntsd)) + return -EINVAL; + rc = parse_sid(group_sid_ptr, end_of_acl); if (rc) { pr_err("%s: Error %d mapping Owner SID to gid\n", @@ -884,6 +900,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, pntsd->type |= cpu_to_le16(DACL_PROTECTED); if (dacloffset) { + if (dacloffset < sizeof(struct smb_ntsd)) + return -EINVAL; + parse_dacl(idmap, dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); } @@ -1006,8 +1025,9 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct smb_sid owner_sid, group_sid; struct dentry *parent = path->dentry->d_parent; struct mnt_idmap *idmap = mnt_idmap(path->mnt); - int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size; - int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; + int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size; + int rc = 0, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; + u16 num_aces, ace_cnt = 0; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); @@ -1023,7 +1043,7 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset); acl_len = pntsd_size - dacloffset; - num_aces = le32_to_cpu(parent_pdacl->num_aces); + num_aces = le16_to_cpu(parent_pdacl->num_aces); pntsd_type = le16_to_cpu(parent_pntsd->type); pdacl_size = le16_to_cpu(parent_pdacl->size); @@ -1183,7 +1203,7 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset)); pdacl->revision = cpu_to_le16(2); pdacl->size = cpu_to_le16(sizeof(struct smb_acl) + nt_size); - pdacl->num_aces = cpu_to_le32(ace_cnt); + pdacl->num_aces = cpu_to_le16(ace_cnt); pace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); memcpy(pace, aces_base, nt_size); pntsd_size += sizeof(struct smb_acl) + nt_size; @@ -1264,7 +1284,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); aces_size = acl_size - sizeof(struct smb_acl); - for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) { + for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) { if (offsetof(struct smb_ace, access_req) > aces_size) break; ace_size = le16_to_cpu(ace->size); @@ -1285,7 +1305,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); aces_size = acl_size - sizeof(struct smb_acl); - for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) { + for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) { if (offsetof(struct smb_ace, access_req) > aces_size) break; ace_size = le16_to_cpu(ace->size); diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h index 24ce576fc2924..355adaee39b87 100644 --- a/fs/smb/server/smbacl.h +++ b/fs/smb/server/smbacl.h @@ -86,7 +86,7 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr); -int init_acl_state(struct posix_acl_state *state, int cnt); +int init_acl_state(struct posix_acl_state *state, u16 cnt); void free_acl_state(struct posix_acl_state *state); void posix_state_to_acl(struct posix_acl_state *state, struct posix_acl_entry *pace); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 0460ebea6ff02..3f185ae60dc51 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -281,6 +281,7 @@ static int handle_response(int type, void *payload, size_t sz) if (entry->type + 1 != type) { pr_err("Waiting for IPC type %d, got %d. Ignore.\n", entry->type + 1, type); + continue; } entry->response = kvzalloc(sz, KSMBD_DEFAULT_GFP); diff --git a/fs/splice.c b/fs/splice.c index 28cfa63aa2364..23fa5561b9441 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -331,7 +331,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, int i; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); npages = DIV_ROUND_UP(len, PAGE_SIZE); @@ -527,7 +527,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des return -ERESTARTSYS; repeat: - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { if (!pipe->writers) return 0; @@ -820,7 +820,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, if (signal_pending(current)) break; - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { ret = 0; if (!pipe->writers) goto out; @@ -968,7 +968,7 @@ static ssize_t do_splice_read(struct file *in, loff_t *ppos, return 0; /* Don't try to read more the pipe has space for. */ - p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + p_space = pipe->max_usage - pipe_buf_usage(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) @@ -1080,7 +1080,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, more = sd->flags & SPLICE_F_MORE; sd->flags |= SPLICE_F_MORE; - WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); + WARN_ON_ONCE(!pipe_is_empty(pipe)); while (len) { size_t read_len; @@ -1268,7 +1268,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) send_sig(SIGPIPE, current, 0); return -EPIPE; } - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_is_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; @@ -1652,13 +1652,13 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_empty(pipe->head, pipe->tail)) + if (!pipe_is_empty(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -1688,13 +1688,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_is_full(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (pipe_is_full(pipe)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; diff --git a/fs/timerfd.c b/fs/timerfd.c index 9f7eb451a60f6..cee007e0d9780 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { - hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); - ctx->t.tmr.function = timerfd_tmrproc; } if (texp != 0) { @@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else - hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 01d8eb1703820..a79f229df4754 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->c = c; wbuf->next_ino = 0; - hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wbuf->timer.function = wbuf_timer_callback_nolock; + hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15bb790359f81..5d560e9073f42 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -29,11 +29,6 @@ struct kmem_cache *xfs_buf_cache; /* * Locking orders * - * xfs_buf_ioacct_inc: - * xfs_buf_ioacct_dec: - * b_sema (caller holds) - * b_lock - * * xfs_buf_stale: * b_sema (caller holds) * b_lock @@ -81,51 +76,6 @@ xfs_buf_vmap_len( return (bp->b_page_count * PAGE_SIZE); } -/* - * Bump the I/O in flight count on the buftarg if we haven't yet done so for - * this buffer. The count is incremented once per buffer (per hold cycle) - * because the corresponding decrement is deferred to buffer release. Buffers - * can undergo I/O multiple times in a hold-release cycle and per buffer I/O - * tracking adds unnecessary overhead. This is used for sychronization purposes - * with unmount (see xfs_buftarg_drain()), so all we really need is a count of - * in-flight buffers. - * - * Buffers that are never released (e.g., superblock, iclog buffers) must set - * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count - * never reaches zero and unmount hangs indefinitely. - */ -static inline void -xfs_buf_ioacct_inc( - struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_NO_IOACCT) - return; - - ASSERT(bp->b_flags & XBF_ASYNC); - spin_lock(&bp->b_lock); - if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { - bp->b_state |= XFS_BSTATE_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); - } - spin_unlock(&bp->b_lock); -} - -/* - * Clear the in-flight state on a buffer about to be released to the LRU or - * freed and unaccount from the buftarg. - */ -static inline void -__xfs_buf_ioacct_dec( - struct xfs_buf *bp) -{ - lockdep_assert_held(&bp->b_lock); - - if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { - bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); - } -} - /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -149,15 +99,7 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - /* - * Once the buffer is marked stale and unlocked, a subsequent lookup - * could reset b_flags. There is no guarantee that the buffer is - * unaccounted (released to LRU) before that occurs. Drop in-flight - * status now to preserve accounting consistency. - */ spin_lock(&bp->b_lock); - __xfs_buf_ioacct_dec(bp); - atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) @@ -794,18 +736,13 @@ xfs_buf_get_map( int _xfs_buf_read( - struct xfs_buf *bp, - xfs_buf_flags_t flags) + struct xfs_buf *bp) { - ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); - bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - + bp->b_flags |= XBF_READ; xfs_buf_submit(bp); - if (flags & XBF_ASYNC) - return 0; return xfs_buf_iowait(bp); } @@ -857,6 +794,8 @@ xfs_buf_read_map( struct xfs_buf *bp; int error; + ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); + flags |= XBF_READ; *bpp = NULL; @@ -870,21 +809,11 @@ xfs_buf_read_map( /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; - error = _xfs_buf_read(bp, flags); - - /* Readahead iodone already dropped the buffer, so exit. */ - if (flags & XBF_ASYNC) - return 0; + error = _xfs_buf_read(bp); } else { /* Buffer already read; all we need to do is check it. */ error = xfs_buf_reverify(bp, ops); - /* Readahead already finished; drop the buffer and exit. */ - if (flags & XBF_ASYNC) { - xfs_buf_relse(bp); - return 0; - } - /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; ASSERT(bp->b_ops != NULL || ops == NULL); @@ -936,6 +865,7 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { + const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; struct xfs_buf *bp; /* @@ -945,9 +875,21 @@ xfs_buf_readahead_map( if (xfs_buftarg_is_mem(target)) return; - xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, - __this_address); + if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) + return; + trace_xfs_buf_readahead(bp, 0, _RET_IP_); + + if (bp->b_flags & XBF_DONE) { + xfs_buf_reverify(bp, ops); + xfs_buf_relse(bp); + return; + } + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + bp->b_flags &= ~(XBF_WRITE | XBF_DONE); + bp->b_flags |= flags; + percpu_counter_inc(&target->bt_readahead_count); + xfs_buf_submit(bp); } /* @@ -1003,10 +945,12 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + /* there are currently no valid flags for xfs_buf_get_uncached */ + ASSERT(flags == 0); + *bpp = NULL; - /* flags might contain irrelevant bits, pass only what we care about */ - error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); + error = _xfs_buf_alloc(target, &map, 1, flags, &bp); if (error) return error; @@ -1060,7 +1004,6 @@ xfs_buf_rele_uncached( spin_unlock(&bp->b_lock); return; } - __xfs_buf_ioacct_dec(bp); spin_unlock(&bp->b_lock); xfs_buf_free(bp); } @@ -1079,20 +1022,12 @@ xfs_buf_rele_cached( spin_lock(&bp->b_lock); ASSERT(bp->b_hold >= 1); if (bp->b_hold > 1) { - /* - * Drop the in-flight state if the buffer is already on the LRU - * and it holds the only reference. This is racy because we - * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT - * ensures the decrement occurs only once per-buf. - */ - if (--bp->b_hold == 1 && !list_empty(&bp->b_lru)) - __xfs_buf_ioacct_dec(bp); + bp->b_hold--; goto out_unlock; } /* we are asked to drop the last reference */ - __xfs_buf_ioacct_dec(bp); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + if (atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU, keep the reference to the * buffer for the LRU and clear the (now stale) dispose list @@ -1345,6 +1280,7 @@ xfs_buf_ioend_handle_error( resubmit: xfs_buf_ioerror(bp, 0); bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + reinit_completion(&bp->b_iowait); xfs_buf_submit(bp); return true; out_stale: @@ -1355,8 +1291,9 @@ xfs_buf_ioend_handle_error( return false; } -static void -xfs_buf_ioend( +/* returns false if the caller needs to resubmit the I/O, else true */ +static bool +__xfs_buf_ioend( struct xfs_buf *bp) { trace_xfs_buf_iodone(bp, _RET_IP_); @@ -1369,6 +1306,8 @@ xfs_buf_ioend( bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; + if (bp->b_flags & XBF_READ_AHEAD) + percpu_counter_dec(&bp->b_target->bt_readahead_count); } else { if (!bp->b_error) { bp->b_flags &= ~XBF_WRITE_FAIL; @@ -1376,7 +1315,7 @@ xfs_buf_ioend( } if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) - return; + return false; /* clear the retry state */ bp->b_last_error = 0; @@ -1397,7 +1336,15 @@ xfs_buf_ioend( bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | _XBF_LOGRECOVERY); + return true; +} +static void +xfs_buf_ioend( + struct xfs_buf *bp) +{ + if (!__xfs_buf_ioend(bp)) + return; if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else @@ -1411,15 +1358,8 @@ xfs_buf_ioend_work( struct xfs_buf *bp = container_of(work, struct xfs_buf, b_ioend_work); - xfs_buf_ioend(bp); -} - -static void -xfs_buf_ioend_async( - struct xfs_buf *bp) -{ - INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); - queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); + if (__xfs_buf_ioend(bp)) + xfs_buf_relse(bp); } void @@ -1491,7 +1431,13 @@ xfs_buf_bio_end_io( XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend_async(bp); + if (bp->b_flags & XBF_ASYNC) { + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); + } else { + complete(&bp->b_iowait); + } + bio_put(bio); } @@ -1568,9 +1514,11 @@ xfs_buf_iowait( { ASSERT(!(bp->b_flags & XBF_ASYNC)); - trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); - trace_xfs_buf_iowait_done(bp, _RET_IP_); + do { + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); + trace_xfs_buf_iowait_done(bp, _RET_IP_); + } while (!__xfs_buf_ioend(bp)); return bp->b_error; } @@ -1648,9 +1596,6 @@ xfs_buf_submit( */ bp->b_error = 0; - if (bp->b_flags & XBF_ASYNC) - xfs_buf_ioacct_inc(bp); - if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); xfs_buf_ioend(bp); @@ -1776,9 +1721,8 @@ xfs_buftarg_wait( struct xfs_buftarg *btp) { /* - * First wait on the buftarg I/O count for all in-flight buffers to be - * released. This is critical as new buffers do not make the LRU until - * they are released. + * First wait for all in-flight readahead buffers to be released. This is + * critical as new buffers do not make the LRU until they are released. * * Next, flush the buffer workqueue to ensure all completion processing * has finished. Just waiting on buffer locks is not sufficient for @@ -1787,7 +1731,7 @@ xfs_buftarg_wait( * all reference counts have been dropped before we start walking the * LRU list. */ - while (percpu_counter_sum(&btp->bt_io_count)) + while (percpu_counter_sum(&btp->bt_readahead_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); } @@ -1904,8 +1848,8 @@ xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); - ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); - percpu_counter_destroy(&btp->bt_io_count); + ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); + percpu_counter_destroy(&btp->bt_readahead_count); list_lru_destroy(&btp->bt_lru); } @@ -1959,7 +1903,7 @@ xfs_init_buftarg( if (list_lru_init(&btp->bt_lru)) return -ENOMEM; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) goto out_destroy_lru; btp->bt_shrinker = @@ -1973,7 +1917,7 @@ xfs_init_buftarg( return 0; out_destroy_io_count: - percpu_counter_destroy(&btp->bt_io_count); + percpu_counter_destroy(&btp->bt_readahead_count); out_destroy_lru: list_lru_destroy(&btp->bt_lru); return -ENOMEM; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3b4ed42e11c01..80e06eecaf56e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -27,7 +27,6 @@ struct xfs_buf; #define XBF_READ (1u << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ #define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ @@ -58,7 +57,6 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ @@ -77,7 +75,6 @@ typedef unsigned int xfs_buf_flags_t; * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ -#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { struct rhashtable bc_hash; @@ -116,7 +113,7 @@ struct xfs_buftarg { struct shrinker *bt_shrinker; struct list_lru bt_lru; - struct percpu_counter bt_io_count; + struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; /* Atomic write unit values */ @@ -291,7 +288,7 @@ int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); -int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); +int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 07bebbfb16ee1..5b64a2b3b113f 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -117,7 +117,7 @@ xmbuf_free( struct xfs_buftarg *btp) { ASSERT(xfs_buftarg_is_mem(btp)); - ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); trace_xmbuf_free(btp); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b3c27dbccce86..2f76531842f83 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3380,7 +3380,7 @@ xlog_do_recover( */ xfs_buf_lock(bp); xfs_buf_hold(bp); - error = _xfs_buf_read(bp, XBF_READ); + error = _xfs_buf_read(bp); if (error) { if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 477c5262cf912..b69356582b86f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -181,14 +181,11 @@ xfs_readsb( /* * Allocate a (locked) buffer to hold the superblock. This will be kept - * around at all times to optimize access to the superblock. Therefore, - * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count - * elevated. + * around at all times to optimize access to the superblock. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), XBF_NO_IOACCT, &bp, - buf_ops); + BTOBB(sector_size), 0, &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d8e6d073d64dc..57bef567e0116 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1407,7 +1407,7 @@ xfs_rtmount_readsb( /* m_blkbb_log is not set up yet */ error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, - mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp, + mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp, &xfs_rtsb_buf_ops); if (error) { xfs_warn(mp, "rt sb validate failed with error %d.", error); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b29462363b815..bfc2f12490224 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -593,6 +593,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \ DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead); TRACE_EVENT(xfs_buf_ioerror, TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip), diff --git a/include/acpi/processor.h b/include/acpi/processor.h index a17e97e634a68..d0eccbd920e5c 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -280,6 +280,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct acpi_processor_cx *cx, struct acpi_power_register *reg); void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cstate); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx); #else static inline void acpi_processor_power_init_bm_check(struct acpi_processor_flags @@ -300,6 +301,10 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx { return; } +static inline void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + BUG(); +} #endif static inline int call_on_cpu(int cpu, long (*fn)(void *), void *arg, diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index f42133dae68e5..2afc95bf1655f 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -90,7 +90,7 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index c768de6f19a9a..0755bc39b0d80 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -39,7 +39,7 @@ extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; -extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; +extern char __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 01dafd604188f..b550afa15ecd1 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,24 +4,35 @@ #ifndef __ASSEMBLY__ -#ifndef __arch_get_k_vdso_data -static __always_inline struct vdso_data *__arch_get_k_vdso_data(void) +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE + +#ifndef __arch_get_vdso_u_time_data +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - return NULL; + return &vdso_u_time_data; } -#endif /* __arch_get_k_vdso_data */ +#endif + +#ifndef __arch_get_vdso_u_rng_data +static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void) +{ + return &vdso_u_rng_data; +} +#endif + +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ #ifndef __arch_update_vsyscall -static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata) +static __always_inline void __arch_update_vsyscall(struct vdso_time_data *vdata) { } #endif /* __arch_update_vsyscall */ -#ifndef __arch_sync_vdso_data -static __always_inline void __arch_sync_vdso_data(struct vdso_data *vdata) +#ifndef __arch_sync_vdso_time_data +static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { } -#endif /* __arch_sync_vdso_data */ +#endif /* __arch_sync_vdso_time_data */ #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 02a4adb4a9999..58a635a6d5bdf 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -385,6 +385,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN(PAGE_SIZE); \ __nosave_end = .; +#define CACHE_HOT_DATA(align) \ + . = ALIGN(align); \ + *(SORT_BY_ALIGNMENT(.data..hot.*)) \ + . = ALIGN(align); + #define PAGE_ALIGNED_DATA(page_align) \ . = ALIGN(page_align); \ *(.data..page_aligned) \ @@ -404,7 +409,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __start_init_stack = .; \ init_thread_union = .; \ init_stack = .; \ - KEEP(*(.data..init_task)) \ KEEP(*(.data..init_thread_info)) \ . = __start_init_stack + THREAD_SIZE; \ __end_init_stack = .; @@ -457,7 +461,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN((align)); \ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ __start_rodata = .; \ - *(.rodata) *(.rodata.*) \ + *(.rodata) *(.rodata.*) *(.data.rel.ro*) \ SCHED_DATA \ RO_AFTER_INIT_DATA /* Read only after init */ \ . = ALIGN(8); \ @@ -1062,10 +1066,13 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #define PERCPU_INPUT(cacheline) \ __per_cpu_start = .; \ - *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ + __per_cpu_hot_start = .; \ + *(SORT_BY_ALIGNMENT(.data..percpu..hot.*)) \ + __per_cpu_hot_end = .; \ + . = ALIGN(cacheline); \ *(.data..percpu..read_mostly) \ . = ALIGN(cacheline); \ *(.data..percpu) \ @@ -1074,52 +1081,17 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __per_cpu_end = .; /** - * PERCPU_VADDR - define output section for percpu area + * PERCPU_SECTION - define output section for percpu area * @cacheline: cacheline size - * @vaddr: explicit base address (optional) - * @phdr: destination PHDR (optional) * * Macro which expands to output section for percpu area. * * @cacheline is used to align subsections to avoid false cacheline * sharing between subsections for different purposes. - * - * If @vaddr is not blank, it specifies explicit base address and all - * percpu symbols will be offset from the given address. If blank, - * @vaddr always equals @laddr + LOAD_OFFSET. - * - * @phdr defines the output PHDR to use if not blank. Be warned that - * output PHDR is sticky. If @phdr is specified, the next output - * section in the linker script will go there too. @phdr should have - * a leading colon. - * - * Note that this macros defines __per_cpu_load as an absolute symbol. - * If there is no need to put the percpu section at a predetermined - * address, use PERCPU_SECTION. - */ -#define PERCPU_VADDR(cacheline, vaddr, phdr) \ - __per_cpu_load = .; \ - .data..percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \ - PERCPU_INPUT(cacheline) \ - } phdr \ - . = __per_cpu_load + SIZEOF(.data..percpu); - -/** - * PERCPU_SECTION - define output section for percpu area, simple version - * @cacheline: cacheline size - * - * Align to PAGE_SIZE and outputs output section for percpu area. This - * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and - * __per_cpu_start will be identical. - * - * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,) - * except that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 configuration. */ #define PERCPU_SECTION(cacheline) \ . = ALIGN(PAGE_SIZE); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ - __per_cpu_load = .; \ PERCPU_INPUT(cacheline) \ } @@ -1148,6 +1120,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) INIT_TASK_DATA(inittask) \ NOSAVE_DATA \ PAGE_ALIGNED_DATA(pagealigned) \ + CACHE_HOT_DATA(cacheline) \ CACHELINE_ALIGNED_DATA(cacheline) \ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640f..a70e62d69dc79 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -330,7 +330,6 @@ static inline bool acpi_sci_irq_valid(void) } extern int sbf_port; -extern unsigned long acpi_realmode_flags; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); diff --git a/include/linux/align.h b/include/linux/align.h index 2b4acec7b95a2..55debf105a5d6 100644 --- a/include/linux/align.h +++ b/include/linux/align.h @@ -2,14 +2,6 @@ #ifndef _LINUX_ALIGN_H #define _LINUX_ALIGN_H -#include - -/* @a is a power of 2 value */ -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) +#include #endif /* _LINUX_ALIGN_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fa2a76cc2f73d..71f4f0cc3dac6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -28,7 +28,7 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); typedef __u32 __bitwise req_flags_t; /* Keep rqf_name[] in sync with the definitions below */ -enum { +enum rqf_flags { /* drive already may have started this one */ __RQF_STARTED, /* request for flush sequence */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c9..d37751789bf58 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,10 +196,11 @@ struct gendisk { unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; - unsigned int zone_wplugs_hash_bits; - spinlock_t zone_wplugs_lock; + unsigned int zone_wplugs_hash_bits; + atomic_t nr_zone_wplugs; + spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; - struct hlist_head *zone_wplugs_hash; + struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ @@ -367,6 +368,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; + unsigned int min_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; diff --git a/include/linux/cache.h b/include/linux/cache.h index ca2a05682a54b..e69768f50d532 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -3,16 +3,13 @@ #define __LINUX_CACHE_H #include +#include #include #ifndef L1_CACHE_ALIGN #define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES) #endif -#ifndef SMP_CACHE_BYTES -#define SMP_CACHE_BYTES L1_CACHE_BYTES -#endif - /** * SMP_CACHE_ALIGN - align a value to the L2 cacheline size * @x: value to align @@ -63,10 +60,6 @@ #define __ro_after_init __section(".data..ro_after_init") #endif -#ifndef ____cacheline_aligned -#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) -#endif - #ifndef ____cacheline_aligned_in_smp #ifdef CONFIG_SMP #define ____cacheline_aligned_in_smp ____cacheline_aligned diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb00..13cd6469e7e56 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -26,20 +26,41 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +/* + * call_once - Ensure a function has been called exactly once + * + * @once: Tracking struct + * @cb: Function to be called + * + * If @once has never completed successfully before, call @cb and, if + * it returns a zero or positive value, mark @once as completed. Return + * the value returned by @cb + * + * If @once has completed succesfully before, return 0. + * + * The call to @cb is implicitly surrounded by a mutex, though for + * efficiency the * function avoids taking it after the first call. + */ +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; - - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; - - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + int r, state; + + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; + + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r < 0) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ diff --git a/include/linux/cfi.h b/include/linux/cfi.h index f0df518e11dd1..1db17ecbb86c6 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,8 @@ #include #include +extern bool cfi_warn; + #ifndef cfi_get_offset static inline int cfi_get_offset(void) { diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ec00e3f7af2b3..ee2614adb7858 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -212,7 +212,7 @@ const volatile void * __must_check_fn(const volatile void *val) { return val; } #define no_free_ptr(p) \ - ((typeof(p)) __must_check_fn(__get_and_null(p, NULL))) + ((typeof(p)) __must_check_fn((__force const volatile void *)__get_and_null(p, NULL))) #define return_ptr(p) return no_free_ptr(p) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e947764960496..7bf0c521db634 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -80,6 +80,11 @@ static inline unsigned long compact_gap(unsigned int order) return 2UL << order; } +static inline int current_is_kcompactd(void) +{ + return current->flags & PF_KCOMPACTD; +} + #ifdef CONFIG_COMPACTION extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 200fd3c5bc70d..e80d0f04d78f4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -110,7 +110,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") +#define __annotate_jump_table __section(".data.rel.ro.c_jump_table") #else /* !CONFIG_OBJTOOL */ #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ @@ -212,6 +212,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. + */ +#define KCFI_REFERENCE(sym) __ADDRESSABLE(sym) +#else +#define KCFI_REFERENCE(sym) +#endif + /** * offset_to_ptr - convert a relative memory offset to an absolute pointer * @off: the address of the 32-bit offset value diff --git a/include/linux/cred.h b/include/linux/cred.h index 0c3c4b16b469c..5658a3bfe803c 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -172,18 +172,12 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) static inline const struct cred *override_creds(const struct cred *override_cred) { - const struct cred *old = current->cred; - - rcu_assign_pointer(current->cred, override_cred); - return old; + return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { - const struct cred *override_cred = current->cred; - - rcu_assign_pointer(current->cred, revert_cred); - return override_cred; + return rcu_replace_pointer(current->cred, revert_cred, 1); } /** diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 64130ae19690a..65655a5d1be28 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -65,6 +65,37 @@ enum execmem_range_flags { * Architectures that use EXECMEM_ROX_CACHE must implement this. */ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); + +/** + * execmem_make_temp_rw - temporarily remap region with read-write + * permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Remaps a part of the cached large page in the ROX cache in the range + * [@ptr, @ptr + @size) as writable and not executable. The caller must + * have exclusive ownership of this range and ensure nothing will try to + * execute code in this range. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_make_temp_rw(void *ptr, size_t size); + +/** + * execmem_restore_rox - restore read-only-execute permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Restores read-only-execute permissions on a range [@ptr, @ptr + @size) + * after it was temporarily remapped as writable. Relies on architecture + * implementation of set_memory_rox() to restore mapping using large pages. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_restore_rox(void *ptr, size_t size); +#else +static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } +static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f7..2788df98080f8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2975,8 +2975,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, - iocb->ki_pos + count); + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; @@ -3452,6 +3452,8 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); +extern const char *page_get_link_raw(struct dentry *, struct inode *, + struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f7bfdcf0dda3f..88e0788711581 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -223,6 +223,11 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) } #endif +static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +{ + return HRTIMER_NORESTART; +} + /* Exported timer functions: */ /* Initialize timers: */ @@ -333,6 +338,7 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) static inline void hrtimer_update_function(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *)) { +#ifdef CONFIG_PROVE_LOCKING guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); if (WARN_ON_ONCE(hrtimer_is_queued(timer))) @@ -340,7 +346,7 @@ static inline void hrtimer_update_function(struct hrtimer *timer, if (WARN_ON_ONCE(!function)) return; - +#endif timer->function = function; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec8c0ccc8f959..76a75ec03dd6b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -682,6 +682,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); +void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, @@ -1004,7 +1005,9 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif @@ -1066,6 +1069,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn, return 0; } +static inline void wait_for_freed_hugetlb_folios(void) +{ +} + static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6a..cd729be369b36 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include struct idr { struct radix_tree_root idr_rt; @@ -124,6 +125,22 @@ void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); +struct __class_idr { + struct idr *idr; + int id; +}; + +#define idr_null ((struct __class_idr){ NULL, -1 }) +#define take_idr_id(id) __get_and_null(id, idr_null) + +DEFINE_CLASS(idr_alloc, struct __class_idr, + if (_T.id >= 0) idr_remove(_T.idr, _T.id), + ((struct __class_idr){ + .idr = idr, + .id = idr_alloc(idr, ptr, start, end, gfp), + }), + struct idr *idr, void *ptr, int start, int end, gfp_t gfp); + /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. diff --git a/include/linux/irq.h b/include/linux/irq.h index 8daa17f0107ac..dd5df1e2d032d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI + * @irq_force_complete_move: optional function to force complete pending irq move * @flags: chip specific flags */ struct irq_chip { @@ -537,6 +538,8 @@ struct irq_chip { int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); + void (*irq_force_complete_move)(struct irq_data *data); + unsigned long flags; }; @@ -612,6 +615,7 @@ extern int irq_affinity_online_cpu(unsigned int cpu); #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) +bool irq_can_move_in_process_context(struct irq_data *data); void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { @@ -619,11 +623,10 @@ static inline void irq_move_irq(struct irq_data *data) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); -void irq_force_complete_move(struct irq_desc *desc); #else +static inline bool irq_can_move_in_process_context(struct irq_data *data) { return true; } static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } -static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif extern int no_irq_affinity; diff --git a/include/linux/irqchip/irq-davinci-cp-intc.h b/include/linux/irqchip/irq-davinci-cp-intc.h deleted file mode 100644 index 8d71ed5b5a61f..0000000000000 --- a/include/linux/irqchip/irq-davinci-cp-intc.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2019 Texas Instruments - */ - -#ifndef _LINUX_IRQ_DAVINCI_CP_INTC_ -#define _LINUX_IRQ_DAVINCI_CP_INTC_ - -#include - -/** - * struct davinci_cp_intc_config - configuration data for davinci-cp-intc - * driver. - * - * @reg: register range to map - * @num_irqs: number of HW interrupts supported by the controller - */ -struct davinci_cp_intc_config { - struct resource reg; - unsigned int num_irqs; -}; - -int davinci_cp_intc_init(const struct davinci_cp_intc_config *config); - -#endif /* _LINUX_IRQ_DAVINCI_CP_INTC_ */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e432b6a12a32f..33ff41eef8f73 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -350,13 +350,13 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); -extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token); -extern void irq_set_default_host(struct irq_domain *host); -extern struct irq_domain *irq_get_default_host(void); -extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node, - const struct irq_affinity_desc *affinity); +struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token); +void irq_set_default_host(struct irq_domain *host); +struct irq_domain *irq_get_default_host(void); +int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node, + const struct irq_affinity_desc *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -370,8 +370,8 @@ static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) return fwnode && fwnode->ops == &irqchip_fwnode_ops; } -extern void irq_domain_update_bus_token(struct irq_domain *domain, - enum irq_domain_bus_token bus_token); +void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token); static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, @@ -454,7 +454,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -extern unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *host); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,19 +507,19 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? NULL : d; } -extern void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *host); -extern int irq_domain_associate(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq); -extern void irq_domain_associate_many(struct irq_domain *domain, - unsigned int irq_base, - irq_hw_number_t hwirq_base, int count); +int irq_domain_associate(struct irq_domain *domain, unsigned int irq, + irq_hw_number_t hwirq); +void irq_domain_associate_many(struct irq_domain *domain, + unsigned int irq_base, + irq_hw_number_t hwirq_base, int count); -extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity); -extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); -extern void irq_dispose_mapping(unsigned int virq); +unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); +void irq_dispose_mapping(unsigned int virq); static inline unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq) @@ -527,9 +527,9 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host, return irq_create_mapping_affinity(host, hwirq, NULL); } -extern struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq, - unsigned int *irq); +struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq, + unsigned int *irq); static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) @@ -587,19 +587,21 @@ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest); /* V2 interfaces to support hierarchy IRQ domains. */ -extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, - unsigned int virq); -extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data, irq_flow_handler_t handler, - void *handler_data, const char *handler_name); -extern void irq_domain_reset_irq_data(struct irq_data *irq_data); +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq); +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, + const struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name); +void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -extern struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, void *host_data); +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct fwnode_handle *fwnode, + const struct irq_domain_ops *ops, + void *host_data); static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, unsigned int flags, @@ -613,13 +615,13 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par ops, host_data); } -extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, - const struct irq_affinity_desc *affinity); -extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); -extern void irq_domain_deactivate_irq(struct irq_data *irq_data); +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, + const struct irq_affinity_desc *affinity); +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); +int irq_domain_activate_irq(struct irq_data *irq_data, bool early); +void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) @@ -628,32 +630,29 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, NULL); } -extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); -extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, - unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data); -extern void irq_domain_free_irqs_common(struct irq_domain *domain, - unsigned int virq, - unsigned int nr_irqs); -extern void irq_domain_free_irqs_top(struct irq_domain *domain, - unsigned int virq, unsigned int nr_irqs); - -extern int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); -extern int irq_domain_pop_irq(struct irq_domain *domain, int virq); - -extern int irq_domain_alloc_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); - -extern void irq_domain_free_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs); - -extern int irq_domain_disconnect_hierarchy(struct irq_domain *domain, +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, + unsigned int virq, + irq_hw_number_t hwirq, + const struct irq_chip *chip, + void *chip_data); +void irq_domain_free_irqs_common(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs); +void irq_domain_free_irqs_top(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs); + +int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); +int irq_domain_pop_irq(struct irq_domain *domain, int virq); + +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg); + +void irq_domain_free_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs); + +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) diff --git a/include/linux/log2.h b/include/linux/log2.h index 9f30d087a1281..1366cb688a6d9 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -41,7 +41,7 @@ int __ilog2_u64(u64 n) * *not* considered a power of two. * Return: true if @n is a power of 2, otherwise false. */ -static inline __attribute__((const)) +static __always_inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 483aa90242cdf..75e8850cec3ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -879,10 +879,11 @@ struct mm_struct { */ unsigned int nr_cpus_allowed; /** - * @max_nr_cid: Maximum number of concurrency IDs allocated. + * @max_nr_cid: Maximum number of allowed concurrency + * IDs allocated. * - * Track the highest number of concurrency IDs allocated for the - * mm. + * Track the highest number of allowed concurrency IDs + * allocated for the mm. */ atomic_t max_nr_cid; /** diff --git a/include/linux/module.h b/include/linux/module.h index 30e5b19bafa98..9937e71a3b5bf 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -370,7 +370,6 @@ enum mod_mem_type { struct module_memory { void *base; - void *rw_copy; bool is_rox; unsigned int size; @@ -772,16 +771,6 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); -void *__module_writable_address(struct module *mod, void *loc); - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || - mod->state != MODULE_STATE_UNFORMED) - return loc; - return __module_writable_address(mod, loc); -} - #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -889,11 +878,6 @@ static inline bool module_is_coming(struct module *mod) { return false; } - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - return loc; -} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 1f5507ba5a128..e395461d59e5e 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -108,10 +108,6 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod); - #ifdef CONFIG_MODULES void flush_module_init_free_work(void); #else diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00ea..9abef442c1463 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -558,11 +558,21 @@ enum { MSI_FLAG_NO_AFFINITY = (1 << 21), }; +/* + * Flags for msi_parent_ops::chip_flags + */ +enum { + MSI_CHIP_FLAG_SET_EOI = (1 << 0), + MSI_CHIP_FLAG_SET_ACK = (1 << 1), +}; + /** * struct msi_parent_ops - MSI parent domain callbacks and configuration info * * @supported_flags: Required: The supported MSI flags of the parent domain * @required_flags: Optional: The required MSI flags of the parent MSI domain + * @chip_flags: Optional: Select MSI chip callbacks to update with defaults + * in msi_lib_init_dev_msi_info(). * @bus_select_token: Optional: The bus token of the real parent domain for * irq_domain::select() * @bus_select_mask: Optional: A mask of supported BUS_DOMAINs for @@ -575,6 +585,7 @@ enum { struct msi_parent_ops { u32 supported_flags; u32 required_flags; + u32 chip_flags; u32 bus_select_token; u32 bus_select_mask; const char *prefix; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 71fbebfa43c7e..9ac83ca883266 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -47,6 +47,7 @@ struct nfs4_acl { struct nfs4_label { uint32_t lfs; uint32_t pi; + u32 lsmid; u32 len; char *label; }; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a8dfb38c9bb6f..e78fa535f61dd 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,7 +17,6 @@ void lockup_detector_init(void); void lockup_detector_retry_init(void); void lockup_detector_soft_poweroff(void); -void lockup_detector_cleanup(void); extern int watchdog_user_enabled; extern int watchdog_thresh; @@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_retry_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } -static inline void lockup_detector_cleanup(void) { } #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); -extern void hardlockup_detector_perf_cleanup(void); extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } -static inline void hardlockup_detector_perf_cleanup(void) { } static inline void hardlockup_config_perf_event(const char *str) { } #endif diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index e07e8978d691b..e435250fcb4d0 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -13,6 +13,8 @@ #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 #define NVME_TCP_MIN_MAXH2CDATA 4096 +#define NVME_TCP_MIN_C2HTERM_PLEN 24 +#define NVME_TCP_MAX_C2HTERM_PLEN 152 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdcf..2dc05b1c3283d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,28 +199,54 @@ enum { #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 +/* + * Controller Configuration (CC) register (Offset 14h) + */ enum { + /* Enable (EN): bit 0 */ NVME_CC_ENABLE = 1 << 0, NVME_CC_EN_SHIFT = 0, + + /* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */ + + /* I/O Command Set Selected (CSS): bits 06:04 */ NVME_CC_CSS_SHIFT = 4, - NVME_CC_MPS_SHIFT = 7, - NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, - NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, + + /* Memory Page Size (MPS): bits 10:07 */ + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xf << NVME_CC_MPS_SHIFT, + + /* Arbitration Mechanism Selected (AMS): bits 13:11 */ + NVME_CC_AMS_SHIFT = 11, + NVME_CC_AMS_MASK = 7 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + + /* Shutdown Notification (SHN): bits 15:14 */ + NVME_CC_SHN_SHIFT = 14, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, - NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + /* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */ + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOSQES_MASK = 0xf << NVME_CC_IOSQES_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + + /* I/O Completion Queue Entry Size (IOCQES): bits 23:20 */ + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_IOCQES_MASK = 0xf << NVME_CC_IOCQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + + /* Controller Ready Independent of Media Enable (CRIME): bit 24 */ NVME_CC_CRIME = 1 << 24, + + /* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */ }; enum { diff --git a/include/linux/objtool.h b/include/linux/objtool.h index c722a921165ba..3ca965a2ddc80 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -128,7 +128,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) +#define __ASM_ANNOTATE(label, type) "" #define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 @@ -147,6 +147,8 @@ * these relocations will never be used for indirect calls. */ #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) + /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b609..0fcacb9097789 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -26,13 +26,11 @@ #define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" #endif -#define PER_CPU_FIRST_SECTION "..first" #else #define PER_CPU_SHARED_ALIGNED_SECTION "" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" -#define PER_CPU_FIRST_SECTION "" #endif @@ -115,14 +113,17 @@ DEFINE_PER_CPU_SECTION(type, name, "") /* - * Declaration/definition used for per-CPU variables that must come first in - * the set of variables. + * Declaration/definition used for per-CPU variables that are frequently + * accessed and should be in a single cacheline. + * + * For use only by architecture and core code. Only use scalar or pointer + * types to maximize density. */ -#define DECLARE_PER_CPU_FIRST(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) +#define DECLARE_PER_CPU_CACHE_HOT(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..hot.." #name) -#define DEFINE_PER_CPU_FIRST(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) +#define DEFINE_PER_CPU_CACHE_HOT(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..hot.." #name) /* * Declaration/definition used for per-CPU variables that must be cacheline diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8333f132f4a96..76f4265efee92 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -343,8 +343,7 @@ struct pmu { */ unsigned int scope; - int __percpu *pmu_disable_count; - struct perf_cpu_pmu_context __percpu *cpu_pmu_context; + struct perf_cpu_pmu_context __percpu **cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -673,13 +672,15 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x01 -#define PERF_ATTACH_GROUP 0x02 -#define PERF_ATTACH_TASK 0x04 -#define PERF_ATTACH_TASK_DATA 0x08 -#define PERF_ATTACH_ITRACE 0x10 -#define PERF_ATTACH_SCHED_CB 0x20 -#define PERF_ATTACH_CHILD 0x40 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_ITRACE 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 struct bpf_prog; struct perf_cgroup; @@ -921,7 +922,7 @@ struct perf_event_pmu_context { struct list_head pinned_active; struct list_head flexible_active; - /* Used to avoid freeing per-cpu perf_event_pmu_context */ + /* Used to identify the per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; @@ -1029,6 +1030,7 @@ struct perf_cpu_pmu_context { int active_oncpu; int exclusive; + int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; @@ -1062,7 +1064,13 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - u64 aux_flags; + union { + u64 flags; /* perf_output*() */ + u64 aux_flags; /* perf_aux_output*() */ + struct { + u64 skip_read : 1; + }; + }; union { void *addr; unsigned long head; @@ -1646,19 +1654,10 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 } extern int sysctl_perf_event_paranoid; -extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; -extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); - /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372e..acf387d199d7b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1508,14 +1508,24 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, } /* - * track_pfn_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). + * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page + * tables copied during copy_page_range(). */ -static inline int track_pfn_copy(struct vm_area_struct *vma) +static inline int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { return 0; } +/* + * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during + * copy_page_range(), but after track_pfn_copy() was already called. + */ +static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ +} + /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or @@ -1528,8 +1538,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma, } /* - * untrack_pfn_clear is called while mremapping a pfnmap for a new region - * or fails to copy pgtable during duplicate vm area. + * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA: + * + * 1) During mremap() on the src VMA after the page tables were moved. + * 2) During fork() on the dst VMA, immediately after duplicating the src VMA. */ static inline void untrack_pfn_clear(struct vm_area_struct *vma) { @@ -1540,7 +1552,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); -extern int track_pfn_copy(struct vm_area_struct *vma); +extern int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma); +extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked); extern void untrack_pfn_clear(struct vm_area_struct *vma); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8ff23bf5a8197..b698758000f8b 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -31,6 +31,33 @@ struct pipe_buffer { unsigned long private; }; +/* + * Really only alpha needs 32-bit fields, but + * might as well do it for 64-bit architectures + * since that's what we've historically done, + * and it makes 'head_tail' always be a simple + * 'unsigned long'. + */ +#ifdef CONFIG_64BIT +typedef unsigned int pipe_index_t; +#else +typedef unsigned short pipe_index_t; +#endif + +/* + * We have to declare this outside 'struct pipe_inode_info', + * but then we can't use 'union pipe_index' for an anonymous + * union, so we end up having to duplicate this declaration + * below. Annoying. + */ +union pipe_index { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; +}; + /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing @@ -38,6 +65,7 @@ struct pipe_buffer { * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) @@ -58,8 +86,16 @@ struct pipe_buffer { struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - unsigned int head; - unsigned int tail; + + /* This has to match the 'union pipe_index' above */ + union { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; + }; + unsigned int max_usage; unsigned int ring_size; unsigned int nr_accounted; @@ -141,23 +177,23 @@ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) } /** - * pipe_empty - Return true if the pipe is empty + * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline bool pipe_empty(unsigned int head, unsigned int tail) +static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head == tail; + return (pipe_index_t)(head - tail); } /** - * pipe_occupancy - Return number of slots used in the pipe + * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) +static inline bool pipe_empty(unsigned int head, unsigned int tail) { - return head - tail; + return !pipe_occupancy(head, tail); } /** @@ -172,6 +208,33 @@ static inline bool pipe_full(unsigned int head, unsigned int tail, return pipe_occupancy(head, tail) >= limit; } +/** + * pipe_is_full - Return true if the pipe is full + * @pipe: the pipe + */ +static inline bool pipe_is_full(const struct pipe_inode_info *pipe) +{ + return pipe_full(pipe->head, pipe->tail, pipe->max_usage); +} + +/** + * pipe_is_empty - Return true if the pipe is empty + * @pipe: the pipe + */ +static inline bool pipe_is_empty(const struct pipe_inode_info *pipe) +{ + return pipe_empty(pipe->head, pipe->tail); +} + +/** + * pipe_buf_usage - Return how many pipe buffers are in use + * @pipe: the pipe + */ +static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe) +{ + return pipe_occupancy(pipe->head, pipe->tail); +} + /** * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring * @pipe: The pipe to access @@ -245,15 +308,6 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, return buf->ops->try_steal(pipe, buf); } -static inline void pipe_discard_from(struct pipe_inode_info *pipe, - unsigned int old_head) -{ - unsigned int mask = pipe->ring_size - 1; - - while (pipe->head > old_head) - pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]); -} - /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ab5b0e8eb2c1..8c9df7dadd5d3 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -33,6 +33,8 @@ enum platform_profile_option { * @probe: Callback to setup choices available to the new class device. These * choices will only be enforced when setting a new profile, not when * getting the current one. + * @hidden_choices: Callback to setup choices that are not visible to the user + * but can be set by the driver. * @profile_get: Callback that will be called when showing the current platform * profile in sysfs. * @profile_set: Callback that will be called when storing a new platform @@ -40,6 +42,7 @@ enum platform_profile_option { */ struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); + int (*hidden_choices)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ca86235ac15c0..4c1af9b7e28b2 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -319,6 +319,7 @@ do { \ #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; +struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h index 2c8bfd0f1b6b3..6322d8c1c6b42 100644 --- a/include/linux/rcuref.h +++ b/include/linux/rcuref.h @@ -71,27 +71,30 @@ static inline __must_check bool rcuref_get(rcuref_t *ref) return rcuref_get_slowpath(ref); } -extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { + int cnt; + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ - if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + cnt = atomic_sub_return_release(1, &ref->refcnt); + if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ - return rcuref_put_slowpath(ref); + return rcuref_put_slowpath(ref, cnt); } /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d6..9c15365a30c08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1701,7 +1701,7 @@ extern struct pid *cad_pid; #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF__HOLE__00010000 0x00010000 +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 928a626725e69..b13474825130f 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -531,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & diff --git a/include/linux/sizes.h b/include/linux/sizes.h index c3a00b967d18a..49039494076fa 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -23,17 +23,25 @@ #define SZ_4K 0x00001000 #define SZ_8K 0x00002000 #define SZ_16K 0x00004000 +#define SZ_24K 0x00006000 #define SZ_32K 0x00008000 #define SZ_64K 0x00010000 #define SZ_128K 0x00020000 +#define SZ_192K 0x00030000 #define SZ_256K 0x00040000 +#define SZ_384K 0x00060000 #define SZ_512K 0x00080000 #define SZ_1M 0x00100000 #define SZ_2M 0x00200000 +#define SZ_3M 0x00300000 #define SZ_4M 0x00400000 +#define SZ_6M 0x00600000 #define SZ_8M 0x00800000 +#define SZ_12M 0x00c00000 #define SZ_16M 0x01000000 +#define SZ_18M 0x01200000 +#define SZ_24M 0x01800000 #define SZ_32M 0x02000000 #define SZ_64M 0x04000000 #define SZ_128M 0x08000000 diff --git a/include/linux/socket.h b/include/linux/socket.h index d18cc47e89bd0..c3322eb3d6865 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,8 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data); struct timespec64; struct __kernel_timespec; diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index fec1e8a1570c3..eac57914dcf32 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -158,7 +158,6 @@ enum { RPC_TASK_NEED_XMIT, RPC_TASK_NEED_RECV, RPC_TASK_MSG_PIN_WAIT, - RPC_TASK_SIGNALLED, }; #define rpc_test_and_set_running(t) \ @@ -171,7 +170,7 @@ enum { #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) -#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) +#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS) /* * Task priorities. diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d4513..002f4e1debe59 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -994,6 +994,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 876e31b4461d0..0b8b32bf06551 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -165,6 +165,4 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #endif -struct vdso_data *arch_get_vdso_data(void *vvar_page); - #endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b1df7d792fa16..58868407fed06 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -39,6 +40,8 @@ struct page; #define MAX_URETPROBE_DEPTH 64 +#define UPROBE_NO_TRAMPOLINE_VADDR (~0UL) + struct uprobe_consumer { /* * handler() can return UPROBE_HANDLER_REMOVE to signal the need to @@ -143,6 +146,7 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + bool signal_denied; struct arch_uprobe *auprobe; }; @@ -182,8 +186,14 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode, + int nbytes, void *data); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); @@ -191,7 +201,10 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t, bool); +extern int uprobe_write(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool orig, void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -221,8 +234,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h new file mode 100644 index 0000000000000..a91fa24b06e09 --- /dev/null +++ b/include/linux/vdso_datastore.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VDSO_DATASTORE_H +#define _LINUX_VDSO_DATASTORE_H + +#include + +extern const struct vm_special_mapping vdso_vvar_mapping; +struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); + +#endif /* _LINUX_VDSO_DATASTORE_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f70d0958095cd..5a37cb2b6f93b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -151,6 +151,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, + DIRECT_MAP_LEVEL2_COLLAPSE, + DIRECT_MAP_LEVEL3_COLLAPSE, #endif #ifdef CONFIG_PER_VMA_LOCK_STATS VMA_LOCK_SUCCESS, diff --git a/include/net/sock.h b/include/net/sock.h index 8036b3b79cd8b..7ef728324e4e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1751,6 +1751,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); +void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 3dc7a1551ac35..5d653a3491d07 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #define CS35L56_DEVID 0x0000000 @@ -61,6 +62,7 @@ #define CS35L56_IRQ1_MASK_8 0x000E0AC #define CS35L56_IRQ1_MASK_18 0x000E0D4 #define CS35L56_IRQ1_MASK_20 0x000E0DC +#define CS35L56_DSP_MBOX_1_RAW 0x0011000 #define CS35L56_DSP_VIRTUAL1_MBOX_1 0x0011020 #define CS35L56_DSP_VIRTUAL1_MBOX_2 0x0011024 #define CS35L56_DSP_VIRTUAL1_MBOX_3 0x0011028 @@ -224,6 +226,7 @@ #define CS35L56_HALO_STATE_SHUTDOWN 1 #define CS35L56_HALO_STATE_BOOT_DONE 2 +#define CS35L56_MBOX_CMD_PING 0x0A000000 #define CS35L56_MBOX_CMD_AUDIO_PLAY 0x0B000001 #define CS35L56_MBOX_CMD_AUDIO_PAUSE 0x0B000002 #define CS35L56_MBOX_CMD_AUDIO_REINIT 0x0B000003 @@ -254,6 +257,16 @@ #define CS35L56_NUM_BULK_SUPPLIES 3 #define CS35L56_NUM_DSP_REGIONS 5 +/* Additional margin for SYSTEM_RESET to control port ready on SPI */ +#define CS35L56_SPI_RESET_TO_PORT_READY_US (CS35L56_CONTROL_PORT_READY_US + 2500) + +struct cs35l56_spi_payload { + __be32 addr; + __be16 pad; + __be32 value; +} __packed; +static_assert(sizeof(struct cs35l56_spi_payload) == 10); + struct cs35l56_base { struct device *dev; struct regmap *regmap; @@ -269,6 +282,7 @@ struct cs35l56_base { s8 cal_index; struct cirrus_amp_cal_data cal_data; struct gpio_desc *reset_gpio; + struct cs35l56_spi_payload *spi_payload_buf; }; static inline bool cs35l56_is_otp_register(unsigned int reg) @@ -276,6 +290,23 @@ static inline bool cs35l56_is_otp_register(unsigned int reg) return (reg >> 16) == 3; } +static inline int cs35l56_init_config_for_spi(struct cs35l56_base *cs35l56, + struct spi_device *spi) +{ + cs35l56->spi_payload_buf = devm_kzalloc(&spi->dev, + sizeof(*cs35l56->spi_payload_buf), + GFP_KERNEL | GFP_DMA); + if (!cs35l56->spi_payload_buf) + return -ENOMEM; + + return 0; +} + +static inline bool cs35l56_is_spi(struct cs35l56_base *cs35l56) +{ + return IS_ENABLED(CONFIG_SPI_MASTER) && !!cs35l56->spi_payload_buf; +} + extern const struct regmap_config cs35l56_regmap_i2c; extern const struct regmap_config cs35l56_regmap_spi; extern const struct regmap_config cs35l56_regmap_sdw; diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index b0db89058c911..958a2460330c0 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -174,6 +174,7 @@ enum yfs_cm_operation { EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ EM(afs_cell_trace_get_queue_new, "GET q-new ") \ + EM(afs_cell_trace_get_server, "GET server") \ EM(afs_cell_trace_get_vol, "GET vol ") \ EM(afs_cell_trace_insert, "INSERT ") \ EM(afs_cell_trace_manage, "MANAGE ") \ @@ -182,6 +183,7 @@ enum yfs_cm_operation { EM(afs_cell_trace_put_destroy, "PUT destry") \ EM(afs_cell_trace_put_queue_work, "PUT q-work") \ EM(afs_cell_trace_put_queue_fail, "PUT q-fail") \ + EM(afs_cell_trace_put_server, "PUT server") \ EM(afs_cell_trace_put_vol, "PUT vol ") \ EM(afs_cell_trace_see_source, "SEE source") \ EM(afs_cell_trace_see_ws, "SEE ws ") \ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b13dc275ef4a7..851841336ee65 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -360,8 +360,7 @@ TRACE_EVENT(rpc_request, { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \ { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \ { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \ - { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \ - { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" }) + { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }) DECLARE_EVENT_CLASS(rpc_task_running, diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b44069d29cecc..49f9f90458d8c 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -11,6 +11,7 @@ typedef __u16 Elf32_Half; typedef __u32 Elf32_Off; typedef __s32 Elf32_Sword; typedef __u32 Elf32_Word; +typedef __u16 Elf32_Versym; /* 64-bit ELF base types. */ typedef __u64 Elf64_Addr; @@ -21,6 +22,7 @@ typedef __s32 Elf64_Sword; typedef __u32 Elf64_Word; typedef __u64 Elf64_Xword; typedef __s64 Elf64_Sxword; +typedef __u16 Elf64_Versym; /* These constants are for the segment types stored in the image headers */ #define PT_NULL 0 @@ -107,6 +109,7 @@ typedef __s64 Elf64_Sxword; #define DT_VALRNGLO 0x6ffffd00 #define DT_VALRNGHI 0x6ffffdff #define DT_ADDRRNGLO 0x6ffffe00 +#define DT_GNU_HASH 0x6ffffef5 #define DT_ADDRRNGHI 0x6ffffeff #define DT_VERSYM 0x6ffffff0 #define DT_RELACOUNT 0x6ffffff9 @@ -125,6 +128,8 @@ typedef __s64 Elf64_Sxword; #define STB_GLOBAL 1 #define STB_WEAK 2 +#define STN_UNDEF 0 + #define STT_NOTYPE 0 #define STT_OBJECT 1 #define STT_FUNC 2 @@ -133,6 +138,9 @@ typedef __s64 Elf64_Sxword; #define STT_COMMON 5 #define STT_TLS 6 +#define VER_FLG_BASE 0x1 +#define VER_FLG_WEAK 0x2 + #define ELF_ST_BIND(x) ((x) >> 4) #define ELF_ST_TYPE(x) ((x) & 0xf) #define ELF32_ST_BIND(x) ELF_ST_BIND(x) @@ -483,4 +491,34 @@ typedef struct elf64_note { /* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +typedef struct { + Elf32_Half vd_version; + Elf32_Half vd_flags; + Elf32_Half vd_ndx; + Elf32_Half vd_cnt; + Elf32_Word vd_hash; + Elf32_Word vd_aux; + Elf32_Word vd_next; +} Elf32_Verdef; + +typedef struct { + Elf64_Half vd_version; + Elf64_Half vd_flags; + Elf64_Half vd_ndx; + Elf64_Half vd_cnt; + Elf64_Word vd_hash; + Elf64_Word vd_aux; + Elf64_Word vd_next; +} Elf64_Verdef; + +typedef struct { + Elf32_Word vda_name; + Elf32_Word vda_next; +} Elf32_Verdaux; + +typedef struct { + Elf64_Word vda_name; + Elf64_Word vda_next; +} Elf64_Verdaux; + #endif /* _UAPI_LINUX_ELF_H */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e11c826385277..050fa8eb2e8f8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -380,7 +380,7 @@ enum io_uring_op { * result will be the number of buffers send, with * the starting buffer ID in cqe->flags as per * usual for provided buffer usage. The buffers - * will be contigious from the starting buffer ID. + * will be contiguous from the starting buffer ID. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 33745642f7875..e1d2c27533b49 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -268,7 +268,9 @@ struct landlock_net_port_attr { * ~~~~~~~~~~~~~~~~ * * These flags enable to restrict a sandboxed process to a set of network - * actions. This is supported since the Landlock ABI version 4. + * actions. + * + * This is supported since Landlock ABI version 4. * * The following access rights apply to TCP port numbers: * @@ -291,11 +293,13 @@ struct landlock_net_port_attr { * Setting a flag for a ruleset will isolate the Landlock domain to forbid * connections to resources outside the domain. * + * This is supported since Landlock ABI version 6. + * * Scopes: * * - %LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: Restrict a sandboxed process from * connecting to an abstract UNIX socket created by a process outside the - * related Landlock domain (e.g. a parent domain or a non-sandboxed process). + * related Landlock domain (e.g., a parent domain or a non-sandboxed process). * - %LANDLOCK_SCOPE_SIGNAL: Restrict a sandboxed process from sending a signal * to another process outside the domain. */ diff --git a/include/vdso/align.h b/include/vdso/align.h new file mode 100644 index 0000000000000..02dd8626b5c5a --- /dev/null +++ b/include/vdso/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_ALIGN_H +#define __VDSO_ALIGN_H + +#include + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* __VDSO_ALIGN_H */ diff --git a/include/vdso/cache.h b/include/vdso/cache.h new file mode 100644 index 0000000000000..f89d48304bf8f --- /dev/null +++ b/include/vdso/cache.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_CACHE_H +#define __VDSO_CACHE_H + +#include + +#ifndef SMP_CACHE_BYTES +#define SMP_CACHE_BYTES L1_CACHE_BYTES +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#endif /* __VDSO_ALIGN_H */ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index d967baa0cd0c6..1864e76e8f691 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -9,11 +9,14 @@ #include #include +#include #include +#include #include #include #include #include +#include #include #include #include @@ -25,6 +28,15 @@ struct arch_vdso_time_data {}; #endif +#if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) +#include +#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +struct vdso_arch_data { + /* Needed for the generic code, never actually used at runtime */ + char __unused; +}; +#endif + #define VDSO_BASES (CLOCK_TAI + 1) #define VDSO_HRES (BIT(CLOCK_REALTIME) | \ BIT(CLOCK_MONOTONIC) | \ @@ -45,11 +57,11 @@ struct arch_vdso_time_data {}; * * There is one vdso_timestamp object in vvar for each vDSO-accelerated * clock_id. For high-resolution clocks, this encodes the time - * corresponding to vdso_data.cycle_last. For coarse clocks this encodes + * corresponding to vdso_time_data.cycle_last. For coarse clocks this encodes * the actual time. * * To be noticed that for highres clocks nsec is left-shifted by - * vdso_data.cs[x].shift. + * vdso_time_data[x].shift. */ struct vdso_timestamp { u64 sec; @@ -57,7 +69,7 @@ struct vdso_timestamp { }; /** - * struct vdso_data - vdso datapage representation + * struct vdso_clock - vdso per clocksource datapage representation * @seq: timebase sequence counter * @clock_mode: clock mode * @cycle_last: timebase at clocksource init @@ -67,19 +79,9 @@ struct vdso_timestamp { * @shift: clocksource shift * @basetime[clock_id]: basetime per clock_id * @offset[clock_id]: time namespace offset per clock_id - * @tz_minuteswest: minutes west of Greenwich - * @tz_dsttime: type of DST correction - * @hrtimer_res: hrtimer resolution - * @__unused: unused - * @arch_data: architecture specific data (optional, defaults - * to an empty struct) * - * vdso_data will be accessed by 64 bit and compat code at the same time - * so we should be careful before modifying this structure. - * - * The ordering of the struct members is optimized to have fast access to the - * often required struct members which are related to CLOCK_REALTIME and - * CLOCK_MONOTONIC. This information is stored in the first cache lines. + * See also struct vdso_time_data for basic access and ordering information as + * struct vdso_clock is used there. * * @basetime is used to store the base time for the system wide time getter * VVAR page. @@ -92,7 +94,7 @@ struct vdso_timestamp { * For clocks which are not affected by time namespace adjustment the * offset must be zero. */ -struct vdso_data { +struct vdso_clock { u32 seq; s32 clock_mode; @@ -108,14 +110,35 @@ struct vdso_data { struct vdso_timestamp basetime[VDSO_BASES]; struct timens_offset offset[VDSO_BASES]; }; +}; + +/** + * struct vdso_time_data - vdso datapage representation + * @arch_data: architecture specific data (optional, defaults + * to an empty struct) + * @clock_data: clocksource related data (array) + * @tz_minuteswest: minutes west of Greenwich + * @tz_dsttime: type of DST correction + * @hrtimer_res: hrtimer resolution + * @__unused: unused + * + * vdso_time_data will be accessed by 64 bit and compat code at the same time + * so we should be careful before modifying this structure. + * + * The ordering of the struct members is optimized to have fast acces to the + * often required struct members which are related to CLOCK_REALTIME and + * CLOCK_MONOTONIC. This information is stored in the first cache lines. + */ +struct vdso_time_data { + struct arch_vdso_time_data arch_data; - s32 tz_minuteswest; - s32 tz_dsttime; - u32 hrtimer_res; - u32 __unused; + struct vdso_clock clock_data[CS_BASES]; - struct arch_vdso_time_data arch_data; -}; + s32 tz_minuteswest; + s32 tz_dsttime; + u32 hrtimer_res; + u32 __unused; +} ____cacheline_aligned; /** * struct vdso_rng_data - vdso RNG state information @@ -136,22 +159,32 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. */ -extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); -extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); -extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden"))); - -/** - * union vdso_data_store - Generic vDSO data page - */ -union vdso_data_store { - struct vdso_data data[CS_BASES]; - u8 page[1U << CONFIG_PAGE_SHIFT]; +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE +extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); +extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); +extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); + +extern struct vdso_time_data *vdso_k_time_data; +extern struct vdso_rng_data *vdso_k_rng_data; +extern struct vdso_arch_data *vdso_k_arch_data; + +#define VDSO_ARCH_DATA_SIZE ALIGN(sizeof(struct vdso_arch_data), PAGE_SIZE) +#define VDSO_ARCH_DATA_PAGES (VDSO_ARCH_DATA_SIZE >> PAGE_SHIFT) + +enum vdso_pages { + VDSO_TIME_PAGE_OFFSET, + VDSO_TIMENS_PAGE_OFFSET, + VDSO_RNG_PAGE_OFFSET, + VDSO_ARCH_PAGES_START, + VDSO_ARCH_PAGES_END = VDSO_ARCH_PAGES_START + VDSO_ARCH_DATA_PAGES - 1, + VDSO_NR_PAGES }; +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ + /* * The generic vDSO implementation requires that gettimeofday.h * provides: - * - __arch_get_vdso_data(): to get the vdso datapage. * - __arch_get_hw_counter(): to get the hw counter based on the * clock_mode. * - gettimeofday_fallback(): fallback for gettimeofday. @@ -164,6 +197,27 @@ union vdso_data_store { #include #endif /* ENABLE_COMPAT_VDSO */ +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_VDSO_GETRANDOM +#define __vdso_u_rng_data PROVIDE(vdso_u_rng_data = vdso_u_data + 2 * PAGE_SIZE); +#else +#define __vdso_u_rng_data +#endif + +#ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA +#define __vdso_u_arch_data PROVIDE(vdso_u_arch_data = vdso_u_data + 3 * PAGE_SIZE); +#else +#define __vdso_u_arch_data +#endif + +#define VDSO_VVAR_SYMS \ + PROVIDE(vdso_u_data = . - __VDSO_PAGES * PAGE_SIZE); \ + PROVIDE(vdso_u_time_data = vdso_u_data); \ + __vdso_u_rng_data \ + __vdso_u_arch_data \ + + #endif /* !__ASSEMBLY__ */ #endif /* __VDSO_DATAPAGE_H */ diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 3ddb03bb05cbe..0a98fed550ba6 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -7,49 +7,53 @@ #include #include -static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) +static __always_inline u32 vdso_read_begin(const struct vdso_clock *vc) { u32 seq; - while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) + while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) cpu_relax(); smp_rmb(); return seq; } -static __always_inline u32 vdso_read_retry(const struct vdso_data *vd, +static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc, u32 start) { u32 seq; smp_rmb(); - seq = READ_ONCE(vd->seq); + seq = READ_ONCE(vc->seq); return seq != start; } -static __always_inline void vdso_write_begin(struct vdso_data *vd) +static __always_inline void vdso_write_begin(struct vdso_time_data *vd) { + struct vdso_clock *vc = vd->clock_data; + /* * WRITE_ONCE() is required otherwise the compiler can validly tear * updates to vd[x].seq and it is possible that the value seen by the * reader is inconsistent. */ - WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); + WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); + WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); smp_wmb(); } -static __always_inline void vdso_write_end(struct vdso_data *vd) +static __always_inline void vdso_write_end(struct vdso_time_data *vd) { + struct vdso_clock *vc = vd->clock_data; + smp_wmb(); /* * WRITE_ONCE() is required otherwise the compiler can validly tear * updates to vd[x].seq and it is possible that the value seen by the * reader is inconsistent. */ - WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); + WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); + WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); } #endif /* !__ASSEMBLY__ */ diff --git a/init/Kconfig b/init/Kconfig index d0d021b3fa3b3..a0ea04c177842 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1869,11 +1869,6 @@ config KALLSYMS_ALL Say N unless you really need all symbols, or kernel live patching. -config KALLSYMS_ABSOLUTE_PERCPU - bool - depends on KALLSYMS - default X86_64 && SMP - # end of the "standard kernel features (expert users)" menu config ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 5d0928f37471e..91019b4d03088 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -64,7 +64,7 @@ struct io_worker { union { struct rcu_head rcu; - struct work_struct work; + struct delayed_work work; }; }; @@ -770,6 +770,18 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err) } } +static void queue_create_worker_retry(struct io_worker *worker) +{ + /* + * We only bother retrying because there's a chance that the + * failure to create a worker is due to some temporary condition + * in the forking task (e.g. outstanding signal); give the task + * some time to clear that condition. + */ + schedule_delayed_work(&worker->work, + msecs_to_jiffies(worker->init_retries * 5)); +} + static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; @@ -809,12 +821,13 @@ static void create_worker_cont(struct callback_head *cb) /* re-create attempts grab a new worker ref, drop the existing one */ io_worker_release(worker); - schedule_work(&worker->work); + queue_create_worker_retry(worker); } static void io_workqueue_create(struct work_struct *work) { - struct io_worker *worker = container_of(work, struct io_worker, work); + struct io_worker *worker = container_of(work, struct io_worker, + work.work); struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) @@ -855,8 +868,8 @@ static bool create_io_worker(struct io_wq *wq, int index) kfree(worker); goto fail; } else { - INIT_WORK(&worker->work, io_workqueue_create); - schedule_work(&worker->work); + INIT_DELAYED_WORK(&worker->work, io_workqueue_create); + queue_create_worker_retry(worker); } return true; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ceacf6230e342..6427d71c773b6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2045,6 +2045,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return io_init_fail_req(req, -EINVAL); } + opcode = array_index_nospec(opcode, IORING_OP_LAST); + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ @@ -2421,7 +2423,7 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) goto out_wake; } - iowq->t.function = io_cqring_timer_wakeup; + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); hrtimer_set_expires(timer, iowq->timeout); return HRTIMER_RESTART; out_wake: diff --git a/io_uring/net.c b/io_uring/net.c index 17852a6616ffe..5d0b56ff50eed 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -322,7 +322,9 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, if (unlikely(ret)) return ret; - return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + sr->msg_control = iomsg->msg.msg_control_user; + return ret; } #endif diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 190f7ee45de93..89ea0135a1a0d 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,12 +4,6 @@ #include -#define IO_NODE_ALLOC_CACHE_MAX 32 - -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, diff --git a/io_uring/rw.c b/io_uring/rw.c index 7aa1e4c9f64a3..e5528cebcd066 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -23,6 +23,9 @@ #include "poll.h" #include "rw.h" +static void io_complete_rw(struct kiocb *kiocb, long res); +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res); + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -289,6 +292,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.dio_complete = NULL; rw->kiocb.ki_flags = 0; + if (req->ctx->flags & IORING_SETUP_IOPOLL) + rw->kiocb.ki_complete = io_complete_rw_iopoll; + else + rw->kiocb.ki_complete = io_complete_rw; + rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); @@ -552,19 +560,20 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - return; - } - req->cqe.res = res; + else + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ smp_store_release(&req->iopoll_completed, 1); } -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + /* IO was queued async, completion will happen later */ if (ret == -EIOCBQUEUED) return; @@ -586,8 +595,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } - INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, - io_complete_rw, kiocb, ret); + if (req->ctx->flags & IORING_SETUP_IOPOLL) + io_complete_rw_iopoll(&rw->kiocb, ret); + else + io_complete_rw(&rw->kiocb, ret); } static int kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -598,7 +609,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { + if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { __io_complete_rw_common(req, ret); /* * Safe to call io_end from here as we're inline @@ -609,7 +620,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_rw_cleanup(req, issue_flags); return IOU_OK; } else { - io_rw_done(&rw->kiocb, ret); + io_rw_done(req, ret); } return IOU_ISSUE_SKIP_COMPLETE; @@ -813,10 +824,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; - kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; - kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { /* make sure every req only blocks once*/ @@ -826,7 +835,6 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; - kiocb->ki_complete = io_complete_rw; } if (req->flags & REQ_F_HAS_METADATA) { @@ -904,7 +912,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || + (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ goto done; } @@ -977,6 +986,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) if (!io_file_can_poll(req)) return -EBADFD; + /* make it sync, multishot doesn't support async execution */ + rw->kiocb.ki_complete = NULL; ret = __io_read(req, issue_flags); /* diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 48fc8cf707843..c5fb817b1e28e 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -407,8 +407,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; + hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } @@ -430,8 +429,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, data->ts = *ts; list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } @@ -557,7 +555,6 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; @@ -568,6 +565,10 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; + hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data), + data->mode); + } else { + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode); } return 0; } @@ -627,7 +628,6 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } add: list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; @@ -646,7 +646,6 @@ void io_queue_linked_timeout(struct io_kiocb *req) if (timeout->head) { struct io_timeout_data *data = req->async_data; - data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5449756ba102e..3ddfaa9eed5be 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1284,8 +1284,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; + hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3303a3605ee80..6e13af5e32812 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22006,12 +22006,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_get_smp_processor_id && verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if pcpu_hot.cpu_number is ever + * optimization, so if cpu_number is ever * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; diff --git a/kernel/cfi.c b/kernel/cfi.c index 08caad7767176..19be796395420 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -7,6 +7,8 @@ #include +bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); + enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type) { @@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, pr_err("CFI failure at %pS (no target information)\n", (void *)addr); - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + if (cfi_warn) { __warn(NULL, 0, (void *)addr, 0, regs, NULL); return BUG_TRAP_TYPE_WARN; } diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index fbe34299673d3..10b63433f0573 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -220,60 +220,32 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, struct dmem_cgroup_pool_state *test_pool) { struct page_counter *climit; - struct cgroup_subsys_state *css, *next_css; + struct cgroup_subsys_state *css; struct dmemcg_state *dmemcg_iter; - struct dmem_cgroup_pool_state *pool, *parent_pool; - bool found_descendant; + struct dmem_cgroup_pool_state *pool, *found_pool; climit = &limit_pool->cnt; rcu_read_lock(); - parent_pool = pool = limit_pool; - css = &limit_pool->cs->css; - /* - * This logic is roughly equivalent to css_foreach_descendant_pre, - * except we also track the parent pool to find out which pool we need - * to calculate protection values for. - * - * We can stop the traversal once we find test_pool among the - * descendants since we don't really care about any others. - */ - while (pool != test_pool) { - next_css = css_next_child(NULL, css); - if (next_css) { - parent_pool = pool; - } else { - while (css != &limit_pool->cs->css) { - next_css = css_next_child(css, css->parent); - if (next_css) - break; - css = css->parent; - parent_pool = pool_parent(parent_pool); - } - /* - * We can only hit this when test_pool is not a - * descendant of limit_pool. - */ - if (WARN_ON_ONCE(css == &limit_pool->cs->css)) - break; - } - css = next_css; - - found_descendant = false; + css_for_each_descendant_pre(css, &limit_pool->cs->css) { dmemcg_iter = container_of(css, struct dmemcg_state, css); + found_pool = NULL; list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { - if (pool_parent(pool) == parent_pool) { - found_descendant = true; + if (pool->region == limit_pool->region) { + found_pool = pool; break; } } - if (!found_descendant) + if (!found_pool) continue; page_counter_calculate_protection( - climit, &pool->cnt, true); + climit, &found_pool->cnt, true); + + if (found_pool == test_pool) + break; } rcu_read_unlock(); } diff --git a/kernel/cpu.c b/kernel/cpu.c index 07455d25329c9..ad755db29efd4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1453,11 +1453,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); - /* - * Do post unplug cleanup. This is still protected against - * concurrent CPU hotplug via cpu_add_remove_lock. - */ - lockup_detector_cleanup(); arch_smt_update(); return ret; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8a47e52a454f4..6c83ad674d010 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -22,6 +22,7 @@ struct callchain_cpus_entries { int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; +static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { @@ -266,12 +267,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, return entry; } -/* - * Used for sysctl_perf_event_max_stack and - * sysctl_perf_event_max_contexts_per_stack. - */ -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int perf_event_max_stack_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; @@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write, return ret; } + +static const struct ctl_table callchain_sysctl_table[] = { + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_THOUSAND, + }, +}; + +static int __init init_callchain_sysctls(void) +{ + register_sysctl_init("kernel", callchain_sysctl_table); + return 0; +} +core_initcall(init_callchain_sysctls); + diff --git a/kernel/events/core.c b/kernel/events/core.c index bcb09e011e9e1..868edeedc7dec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -452,8 +452,8 @@ static struct kmem_cache *perf_event_cache; */ int sysctl_perf_event_paranoid __read_mostly = 2; -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ +static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate @@ -463,6 +463,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; @@ -484,7 +485,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, +static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -506,9 +507,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, return 0; } -int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; - -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, +static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -528,6 +527,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, return 0; } +static const struct ctl_table events_core_sysctl_table[] = { + /* + * User-space relies on this file as a feature check for + * perf_events being enabled. It's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_event_max_sample_rate_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_events_core_sysctls(void) +{ + register_sysctl_init("kernel", events_core_sysctl_table); + return 0; +} +core_initcall(init_events_core_sysctls); + + /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not @@ -1147,8 +1192,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - timer->function = perf_mux_hrtimer_handler; + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) @@ -1172,23 +1217,35 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) return perf_mux_hrtimer_restart(arg); } +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return *this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); + int *count = &this_cpc(pmu)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); +} + +static inline void perf_pmu_read(struct perf_event *event) +{ + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); } static void get_ctx(struct perf_event_context *ctx) @@ -2303,7 +2360,7 @@ static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled @@ -2444,9 +2501,8 @@ __perf_remove_from_context(struct perf_event *event, pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -2584,7 +2640,7 @@ static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2691,7 +2747,7 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. @@ -3314,9 +3370,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -3473,8 +3528,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -3564,7 +3618,7 @@ static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); @@ -3673,7 +3727,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); @@ -3685,7 +3739,7 @@ void perf_sched_cb_dec(struct pmu *pmu) void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); @@ -3802,7 +3856,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx) if (!pmu_ctx->ctx->task) return; - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } @@ -3931,10 +3985,9 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; - cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } @@ -4618,15 +4671,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -4950,7 +4996,7 @@ static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { - struct perf_event_pmu_context *new = NULL, *epc; + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { @@ -4961,11 +5007,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, */ struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { - atomic_set(&epc->refcount, 1); + /* + * One extra reference for the pmu; see perf_pmu_free(). + */ + atomic_set(&epc->refcount, 2); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; @@ -5007,12 +5056,19 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, atomic_inc(&epc->refcount); goto found_epc; } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; } epc = new; new = NULL; - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + epc->ctx = ctx; found_epc: @@ -5034,6 +5090,15 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc) WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } +static void free_cpc_rcu(struct rcu_head *head) +{ + struct perf_cpu_pmu_context *cpc = + container_of(head, typeof(*cpc), epc.rcu_head); + + kfree(cpc->epc.task_ctx_data); + kfree(cpc); +} + static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); @@ -5068,8 +5133,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc) raw_spin_unlock_irqrestore(&ctx->lock, flags); - if (epc->embedded) + if (epc->embedded) { + call_rcu(&epc->rcu_head, free_cpc_rcu); return; + } call_rcu(&epc->rcu_head, free_epc_rcu); } @@ -5239,6 +5306,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -5246,14 +5315,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!is_exclusive_pmu(pmu)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -5285,8 +5353,7 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); static void perf_pending_task_sync(struct perf_event *event) { @@ -5312,40 +5379,20 @@ static void perf_pending_task_sync(struct perf_event *event) rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } -static void _free_event(struct perf_event *event) +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) { - irq_work_sync(&event->pending_irq); - irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); - - unaccount_event(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); - security_perf_event_free(event); + kfree(event->addr_filter_ranges); - if (event->rb) { - /* - * Can happen when we close an event with re-directed output. - * - * Since we have a 0 refcount, perf_mmap_close() will skip - * over us; possibly making our ring_buffer_put() the last. - */ - mutex_lock(&event->mmap_mutex); - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - } + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); if (is_cgroup_event(event)) perf_detach_cgroup(event); - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filter_ranges); - if (event->destroy) event->destroy(event); @@ -5356,22 +5403,60 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); - if (event->pmu_ctx) + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); + } /* - * perf_event_free_task() relies on put_ctx() being 'last', in particular - * all task references must be cleaned up. + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); - exclusive_event_destroy(event); - module_put(event->pmu->module); + if (event->pmu) + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + +/* vs perf_event_alloc() success */ +static void _free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); + + unaccount_event(event); + + security_perf_event_free(event); + + if (event->rb) { + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + __free_event(event); +} + /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. @@ -5962,14 +6047,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); @@ -6608,7 +6694,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; - int ret = 0, flags = 0; + int ret, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6626,9 +6712,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + user_extra = nr_pages; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + goto unlock; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + goto unlock; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + ret = 0; + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6637,16 +6768,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) */ u64 aux_offset, aux_size; - if (!event->rb) - return -EINVAL; - - nr_pages = vma_size / PAGE_SIZE; - if (nr_pages > INT_MAX) - return -ENOMEM; - - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; - rb = event->rb; if (!rb) goto aux_unlock; @@ -6687,48 +6808,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; - - goto accounting; - } - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) { - ret = -EINVAL; - goto unlock; - } - - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Raced against perf_mmap_close(); remove the - * event and try again. - */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; - } - - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; } - user_extra = nr_pages + 1; - -accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -6796,6 +6877,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) rb->aux_mmap_locked = extra; } + ret = 0; + unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); @@ -6820,7 +6903,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!ret) ret = map_range(rb, vma); - if (event->pmu->event_mapped) + if (!ret && event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; @@ -7444,9 +7527,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7459,9 +7541,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7520,6 +7601,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -8321,7 +8405,8 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); @@ -8580,7 +8665,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strscpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -9024,7 +9109,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } cpy_name: - strscpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name); name = tmp; got_name: /* @@ -9448,7 +9533,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strscpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -10831,6 +10916,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, void perf_event_free_bpf_prog(struct perf_event *event) { + if (!event->prog) + return; + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; @@ -10929,6 +11017,17 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. @@ -11367,8 +11466,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -11605,11 +11703,6 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -static void free_pmu_context(struct pmu *pmu) -{ - free_percpu(pmu->cpu_pmu_context); -} - /* * Let userspace know that this PMU supports address range filtering: */ @@ -11672,7 +11765,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); @@ -11815,62 +11908,107 @@ static int pmu_dev_alloc(struct pmu *pmu) free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) { - int cpu, ret, max = PERF_TYPE_MAX; + void *tmp, *val = idr_find(idr, id); - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; + if (val != old) + return false; - pmu->type = -1; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { - ret = -EINVAL; - goto free_pdc; + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; + + WARN_ON_ONCE(tmp != val); + return true; +} + +static void perf_pmu_free(struct pmu *pmu) +{ + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); } - if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { - ret = -EINVAL; - goto free_pdc; + if (pmu->cpu_pmu_context) { + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_pmu_context *cpc; + + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + continue; + if (cpc->epc.embedded) { + /* refcount managed */ + put_pmu_ctx(&cpc->epc); + continue; + } + kfree(cpc); + } + free_percpu(pmu->cpu_pmu_context); } +} + +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) + +int perf_pmu_register(struct pmu *_pmu, const char *name, int type) +{ + int cpu, max = PERF_TYPE_MAX; + + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); + + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; + + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; pmu->name = name; if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return pmu_type.id; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && pmu_type.id != type); - type = ret; - pmu->type = type; + pmu->type = pmu_type.id; + atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { - ret = pmu_dev_alloc(pmu); + int ret = pmu_dev_alloc(pmu); if (ret) - goto free_idr; + return ret; } - ret = -ENOMEM; - pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); if (!pmu->cpu_pmu_context) - goto free_dev; + return -ENOMEM; for_each_possible_cpu(cpu) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = + kmalloc_node(sizeof(struct perf_cpu_pmu_context), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + + if (!cpc) + return -ENOMEM; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } @@ -11903,33 +12041,25 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + /* + * Now that the PMU is complete, make it visible to perf_try_init_event(). + */ + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free_dev: - if (pmu->dev && pmu->dev != PMU_NULL_DEV) { - device_del(pmu->dev); - put_device(pmu->dev); - } -free_idr: - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); + scoped_guard (mutex, &pmus_lock) { + list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + } /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -11938,16 +12068,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); - } - free_pmu_context(pmu); - mutex_unlock(&pmus_lock); + perf_pmu_free(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -11987,48 +12108,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); - if (!ret) { - if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && - has_extended_regs(event)) - ret = -EOPNOTSUPP; + if (ret) + goto err_pmu; - if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) - ret = -EINVAL; + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } - if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { - const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); - struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); - int cpu; - - if (pmu_cpumask && cpumask) { - cpu = cpumask_any_and(pmu_cpumask, cpumask); - if (cpu >= nr_cpu_ids) - ret = -ENODEV; - else - event->event_caps |= PERF_EV_CAP_READ_SCOPE; - } else { - ret = -ENODEV; - } - } + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; + + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; + + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; - if (ret && event->destroy) - event->destroy(event); + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } - if (ret) - module_put(pmu->module); + return 0; + +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; - int idx, type, ret; struct pmu *pmu; + int type, ret; - idx = srcu_read_lock(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain @@ -12041,7 +12175,7 @@ static struct pmu *perf_init_event(struct perf_event *event) pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } /* @@ -12060,13 +12194,12 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - rcu_read_lock(); - pmu = idr_find(&pmu_idr, type); - rcu_read_unlock(); + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { @@ -12075,27 +12208,21 @@ static struct pmu *perf_init_event(struct perf_event *event) } if (ret) - pmu = ERR_PTR(ret); + return ERR_PTR(ret); - goto unlock; + return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } -fail: - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) @@ -12222,7 +12349,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; @@ -12237,8 +12363,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; - event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, - node); + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -12345,65 +12471,53 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err_ns; + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err_ns; - } + if (IS_ERR(pmu)) + return (void*)pmu; /* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { - err = -EINVAL; - goto err_pmu; - } + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || - event->attr.aux_pause || event->attr.aux_resume)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); - if (event->attr.aux_pause && event->attr.aux_resume) { - err = -EINVAL; - goto err_pmu; - } + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { - if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err_pmu; + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err_pmu; + return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); - if (!event->addr_filter_ranges) { - err = -ENOMEM; - goto err_per_task; - } + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() @@ -12427,42 +12541,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + return ERR_PTR(err); + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err_callchain_buffer; + return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err_callchain_buffer: - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } -err_addr_filters: - kfree(event->addr_filter_ranges); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (event->hw.target) - put_task_struct(event->hw.target); - call_rcu(&event->rcu_head, free_event_rcu); - - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bc4a61029b6dc..8ec2cb6889038 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. */ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 180509132d4b6..59a52b1a1f789 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ca797cbe465f..a6108bd0b8d78 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -250,21 +250,22 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } -static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +static void uprobe_copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { void *kaddr = kmap_atomic(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -278,7 +279,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -417,7 +418,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); @@ -471,47 +472,46 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) + unsigned long vaddr, uprobe_opcode_t opcode, bool orig) +{ + return uprobe_write(auprobe, mm, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode, orig, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, uprobe_write_verify_t verify, bool orig, void *data) { - struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; - int ret, is_register, ref_ctr_updated = 0; + int ret; bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; - is_register = is_swbp_insn(&opcode); - uprobe = container_of(auprobe, struct uprobe, arch); - retry: - if (is_register) + if (!orig) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); if (IS_ERR(old_page)) return PTR_ERR(old_page); - ret = verify_opcode(old_page, vaddr, &opcode); + ret = verify(old_page, vaddr, insn, nbytes, data); if (ret <= 0) goto put_old; - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { + if (is_zero_page(old_page)) { ret = -EINVAL; goto put_old; } - /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { - ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); - if (ret) - goto put_old; - - ref_ctr_updated = 1; + if (WARN(orig && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; } ret = 0; - if (!is_register && !PageAnon(old_page)) + if (orig && !PageAnon(old_page)) goto put_old; ret = anon_vma_prepare(vma); @@ -525,9 +525,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, __SetPageUptodate(new_page); copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_to_page(new_page, vaddr, insn, nbytes); - if (!is_register) { + if (orig) { struct page *orig_page; pgoff_t index; @@ -560,10 +560,6 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, if (unlikely(ret == -EAGAIN)) goto retry; - /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); - /* try collapse pmd for compound page */ if (!ret && orig_page_huge) collapse_pte_mapped_thp(mm, vaddr, false); @@ -582,7 +578,26 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN, false); +} + +static int set_swbp_refctr(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +{ + int err; + + /* We are going to replace instruction, update ref_ctr. */ + if (uprobe->ref_ctr_offset) { + err = update_ref_ctr(uprobe, mm, 1); + if (err) + return err; + } + + err = set_swbp(&uprobe->arch, mm, vaddr); + + /* Revert back reference counter if instruction update failed. */ + if (err && uprobe->ref_ctr_offset) + update_ref_ctr(uprobe, mm, -1); + return err; } /** @@ -598,7 +613,17 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { return uprobe_write_opcode(auprobe, mm, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, true); +} + +static int set_orig_refctr(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +{ + int err = set_orig_insn(&uprobe->arch, mm, vaddr); + + /* Revert back reference counter even if instruction update failed. */ + if (uprobe->ref_ctr_offset) + update_ref_ctr(uprobe, mm, -1); + return err; } /* uprobe should have guaranteed positive refcount */ @@ -762,10 +787,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) enum hprobe_state hstate; /* - * return_instance's hprobe is protected by RCU. - * Underlying uprobe is itself protected from reuse by SRCU. + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. */ - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { @@ -1027,7 +1056,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1133,7 +1162,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp_refctr(uprobe, mm, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -1146,7 +1175,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_refctr(uprobe, mm, vaddr); } struct map_info { @@ -1371,7 +1400,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), uprobe_copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1437,7 +1466,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1454,7 +1483,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1699,7 +1728,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1731,7 +1760,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1765,6 +1794,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1776,6 +1813,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -1860,7 +1899,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ - copy_to_page(page, vaddr, src, len); + uprobe_copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. @@ -2160,8 +2199,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) */ unsigned long uprobe_get_trampoline_vaddr(void) { + unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; - unsigned long trampoline_vaddr = -1; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ @@ -2302,9 +2341,8 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { - spin_lock_irq(&t->sighand->siglock); + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; @@ -2356,7 +2394,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2640,6 +2678,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2704,6 +2746,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2715,6 +2760,28 @@ static void handle_swbp(struct pt_regs *regs) rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + rcu_read_lock_trace(); + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + goto unlock; + + if (!get_utask()) + goto unlock; + + if (arch_uprobe_ignore(&uprobe->arch, regs)) + goto unlock; + + handler_chain(uprobe, regs); + + unlock: + rcu_read_unlock_trace(); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. @@ -2737,9 +2804,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* see uprobe_deny_signal() */ - spin_unlock_irq(¤t->sighand->siglock); + if (utask->signal_denied) { + set_thread_flag(TIF_SIGPENDING); + utask->signal_denied = false; + } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); diff --git a/kernel/fork.c b/kernel/fork.c index 735405a9c5f32..2720e42508a9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -504,6 +504,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); + /* track_pfn_copy() will later take care of copying internal state. */ + if (unlikely(new->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(new); + return new; } @@ -1260,6 +1264,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } @@ -1891,8 +1896,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 3db8567f5a44e..cca15859a50be 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -50,10 +50,10 @@ */ static struct { struct futex_hash_bucket *queues; - unsigned long hashsize; + unsigned long hashmask; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashmask (__futex_data.hashmask) /* @@ -119,7 +119,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); - return &futex_queues[hash & (futex_hashsize - 1)]; + return &futex_queues[hash & futex_hashmask]; } @@ -1127,27 +1127,28 @@ void futex_exit_release(struct task_struct *tsk) static int __init futex_init(void) { + unsigned long hashsize, i; unsigned int futex_shift; - unsigned long i; #ifdef CONFIG_BASE_SMALL - futex_hashsize = 16; + hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, + hashsize, 0, 0, &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; + hashsize, hashsize); + hashsize = 1UL << futex_shift; - for (i = 0; i < futex_hashsize; i++) { + for (i = 0; i < hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } + futex_hashmask = hashsize - 1; return 0; } core_initcall(futex_init); diff --git a/kernel/iomem.c b/kernel/iomem.c index dc2120776e1c3..75e61c1c6bc07 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -6,7 +6,8 @@ #include #ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +static void *arch_memremap_wb(resource_size_t offset, unsigned long size, + unsigned long flags) { #ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); @@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size, flags); if (!addr) - addr = arch_memremap_wb(offset, size); + addr = arch_memremap_wb(offset, size, flags); } /* diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c901436ebd9f4..0ff987d3a7991 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, } #endif +static void irq_enable(struct irq_desc *desc) +{ + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } +} + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -332,21 +347,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc) irq_domain_deactivate_irq(&desc->irq_data); } -void irq_enable(struct irq_desc *desc) -{ - if (!irqd_irq_disabled(&desc->irq_data)) { - unmask_irq(desc); - } else { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) { - desc->irq_data.chip->irq_enable(&desc->irq_data); - irq_state_clr_masked(desc); - } else { - unmask_irq(desc); - } - } -} - static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a979523640d0a..b0290849c3956 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -90,7 +90,6 @@ extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); -extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); @@ -98,18 +97,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); -extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); - #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif -extern int __irq_get_irqchip_state(struct irq_data *data, - enum irqchip_irq_state which, - bool *state); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -139,8 +132,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern void irq_set_thread_affinity(struct irq_desc *desc); - extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); @@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) return desc->pending_mask; } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); +void irq_force_complete_move(struct irq_desc *desc); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2878307397833..4258cd6bd3b4f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -991,7 +991,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) +static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ec6d8e72d980f..2861f89880af5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1589,9 +1589,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, } } -int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, + unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f300bb6be3bd4..753eef8e041cb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,6 +35,8 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); + static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); @@ -187,7 +189,7 @@ bool irq_can_set_affinity_usr(unsigned int irq) * set_cpus_allowed_ptr() here as we hold desc->lock and this * code can be called from hard interrupt context. */ -void irq_set_thread_affinity(struct irq_desc *desc) +static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; @@ -2789,8 +2791,7 @@ void teardown_percpu_nmi(unsigned int irq) irq_put_desc_unlock(desc, flags); } -int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, - bool *state) +static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index eb150afd671f6..147cabb4c0779 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) return true; } +void irq_force_complete_move(struct irq_desc *desc) +{ + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) { + if (d->chip && d->chip->irq_force_complete_move) { + d->chip->irq_force_complete_move(d); + return; + } + } +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); @@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } + +bool irq_can_move_in_process_context(struct irq_data *data) +{ + /* + * Get the top level irq_data in the hierarchy, which is optimized + * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled. + */ + data = irq_desc_get_irq_data(irq_data_to_desc(data)); + return irq_can_move_pcntxt(data); +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 396a067a8a56b..fa92882efdb1f 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -756,12 +757,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw return info->ops->msi_translate(domain, fwspec, hwirq, type); } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct msi_desc *desc = irq_data_get_msi_desc(irqd); + + if (!desc) + return; + + seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi); + seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo); + seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data); +} +#endif + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, .translate = msi_domain_translate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = msi_domain_debug_show, +#endif }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9a0ca605d4a8..4198f30aac3ca 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets if --absolute-percpu is not in effect */ - if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; - - /* ...otherwise, positive offsets are absolute values */ - if (kallsyms_offsets[idx] >= 0) - return kallsyms_offsets[idx]; - - /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ - return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; + /* values are unsigned offsets */ + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8a..a114949eeed5e 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840aa..9ef9850aeebe6 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -67,3 +67,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4470680f02269..b15757e636261 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,12 @@ #include #include #include +#include #include #include "lockdep_internals.h" +#include "lock_events.h" #include @@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -5091,8 +5094,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -5824,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b36f23de48f1b..19b636f60a24f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4a8df1800cbbd..c80902eacd797 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); @@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c38a2d2d4a7ee..78dd3d8c65544 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -59,8 +59,8 @@ struct rt_mutex_waiter { }; /** - * rt_wake_q_head - Wrapper around regular wake_q_head to support - * "sleeping" spinlocks on RT + * struct rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT * @head: The regular wake_q_head for sleeping lock variants * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups */ diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 34bfae72f2952..de9117c0e671e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); -static noinline void __up(struct semaphore *sem); +static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); /** * down - acquire the semaphore @@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout); void __sched up(struct semaphore *sem) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else - __up(sem); + __up(sem, &wake_q); raw_spin_unlock_irqrestore(&sem->lock, flags); + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); } EXPORT_SYMBOL(up); @@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } -static noinline void __sched __up(struct semaphore *sem) +static noinline void __sched __up(struct semaphore *sem, + struct wake_q_head *wake_q) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); waiter->up = true; - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } diff --git a/kernel/module/main.c b/kernel/module/main.c index 1fb9ad289a6f8..a256cc919ad74 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1221,18 +1221,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -void *__module_writable_address(struct module *mod, void *loc) -{ - for_class_mod_mem_type(type, text) { - struct module_memory *mem = &mod->mem[type]; - - if (loc >= mem->base && loc < mem->base + mem->size) - return loc + (mem->rw_copy - mem->base); - } - - return loc; -} - static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); @@ -1250,21 +1238,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (!ptr) return -ENOMEM; - mod->mem[type].base = ptr; - if (execmem_is_rox(execmem_type)) { - ptr = vzalloc(size); + int err = execmem_make_temp_rw(ptr, size); - if (!ptr) { - execmem_free(mod->mem[type].base); + if (err) { + execmem_free(ptr); return -ENOMEM; } - mod->mem[type].rw_copy = ptr; mod->mem[type].is_rox = true; - } else { - mod->mem[type].rw_copy = mod->mem[type].base; - memset(mod->mem[type].base, 0, size); } /* @@ -1278,18 +1260,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) * *do* eventually get freed, but let's just keep things simple * and avoid *any* false positives. */ - kmemleak_not_leak(ptr); + if (!mod->mem[type].is_rox) + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; return 0; } +static void module_memory_restore_rox(struct module *mod) +{ + for_class_mod_mem_type(type, text) { + struct module_memory *mem = &mod->mem[type]; + + if (mem->is_rox) + execmem_restore_rox(mem->base, mem->size); + } +} + static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - if (mem->is_rox) - vfree(mem->rw_copy); - execmem_free(mem->base); } @@ -2642,7 +2635,6 @@ static int move_module(struct module *mod, struct load_info *info) for_each_mod_mem_type(type) { if (!mod->mem[type].size) { mod->mem[type].base = NULL; - mod->mem[type].rw_copy = NULL; continue; } @@ -2659,7 +2651,6 @@ static int move_module(struct module *mod, struct load_info *info) void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; const char *sname; - unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; @@ -2680,14 +2671,12 @@ static int move_module(struct module *mod, struct load_info *info) ret = PTR_ERR(dest); goto out_err; } - addr = (unsigned long)dest; codetag_section_found = true; } else { enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + dest = mod->mem[type].base + offset; } if (shdr->sh_type != SHT_NOBITS) { @@ -2710,13 +2699,14 @@ static int move_module(struct module *mod, struct load_info *info) * users of info can keep taking advantage and using the newly * minted official memory area. */ - shdr->sh_addr = addr; + shdr->sh_addr = (unsigned long)dest; pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, (long)shdr->sh_size, info->secstrings + shdr->sh_name); } return 0; out_err: + module_memory_restore_rox(mod); for (t--; t >= 0; t--) module_memory_free(mod, t); if (codetag_section_found) @@ -2863,17 +2853,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr, return 0; } -int __weak module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - static int post_relocation(struct module *mod, const struct load_info *info) { - int ret; - /* Sort exception table now relocations are done. */ sort_extable(mod->extable, mod->extable + mod->num_exentries); @@ -2885,24 +2866,7 @@ static int post_relocation(struct module *mod, const struct load_info *info) add_kallsyms(mod, info); /* Arch-specific module finalizing. */ - ret = module_finalize(info->hdr, info->sechdrs, mod); - if (ret) - return ret; - - for_each_mod_mem_type(type) { - struct module_memory *mem = &mod->mem[type]; - - if (mem->is_rox) { - if (!execmem_update_copy(mem->base, mem->rw_copy, - mem->size)) - return -ENOMEM; - - vfree(mem->rw_copy); - mem->rw_copy = NULL; - } - } - - return module_post_finalize(info->hdr, info->sechdrs, mod); + return module_finalize(info->hdr, info->sechdrs, mod); } /* Call module constructors. */ @@ -3499,6 +3463,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->mem[type].size); } + module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: /* diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 74834ba15615f..03f4142cfbf4e 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" static int module_set_memory(const struct module *mod, enum mod_mem_type type, @@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type, int module_enable_text_rox(const struct module *mod) { for_class_mod_mem_type(type, text) { + const struct module_memory *mem = &mod->mem[type]; int ret; - if (mod->mem[type].is_rox) - continue; - - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + if (mem->is_rox) + ret = execmem_restore_rox(mem->base, mem->size); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) ret = module_set_memory(mod, type, set_memory_rox); else ret = module_set_memory(mod, type, set_memory_x); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 8f6cfec87555a..7098ed44e717d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,7 +107,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - ns->pid_max = parent_pid_ns->pid_max; + ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; diff --git a/kernel/rseq.c b/kernel/rseq.c index 442aba29bc4cf..b7a1ec327e811 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -78,24 +78,24 @@ static int rseq_validate_ro_fields(struct task_struct *t) return -EFAULT; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ - rseq_kernel_fields(t)->cpu_id_start = cpu_id; - rseq_kernel_fields(t)->cpu_id = cpu_id; - rseq_kernel_fields(t)->node_id = node_id; - rseq_kernel_fields(t)->mm_cid = mm_cid; -} +/* + * Update an rseq field and its in-kernel copy in lock-step to keep a coherent + * state. + */ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ + } while (0) + #else static int rseq_validate_ro_fields(struct task_struct *t) { return 0; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ -} +#define rseq_unsafe_put_user(t, value, field, error_label) \ + unsafe_put_user(value, &t->rseq->field, error_label) #endif /* @@ -173,17 +173,18 @@ static int rseq_update_cpu_node_id(struct task_struct *t) WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_put_user(node_id, &rseq->node_id, efault_end); - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + + rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); - rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -195,6 +196,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t) static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { + struct rseq __user *rseq = t->rseq; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; @@ -202,40 +204,61 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) * Validate read-only rseq fields. */ if (rseq_validate_ro_fields(t)) - return -EFAULT; - /* - * Reset cpu_id_start to its initial state (0). - */ - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) - return -EFAULT; - /* - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming - * in after unregistration can figure out that rseq needs to be - * registered again. - */ - if (put_user(cpu_id, &t->rseq->cpu_id)) - return -EFAULT; - /* - * Reset node_id to its initial state (0). - */ - if (put_user(node_id, &t->rseq->node_id)) - return -EFAULT; + goto efault; + + if (!user_write_access_begin(rseq, t->rseq_len)) + goto efault; + /* - * Reset mm_cid to its initial state (0). + * Reset all fields to their initial state. + * + * All fields have an initial state of 0 except cpu_id which is set to + * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after + * unregistration can figure out that rseq needs to be registered + * again. */ - if (put_user(mm_cid, &t->rseq->mm_cid)) - return -EFAULT; - - rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ + user_write_access_end(); + return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; +} + +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. + */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + return 0; } +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -244,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -330,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!event_mask; } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -341,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -375,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -453,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -507,9 +530,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. + */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + #ifdef CONFIG_DEBUG_RSEQ /* * Initialize the in-kernel rseq fields copy for validation of @@ -521,6 +554,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) return -EFAULT; #endif + /* + * Activate the registration by setting the rseq area address, length + * and signature in the task struct. + */ + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9aecd914ac691..dc12791cdb559 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -916,8 +915,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -7285,7 +7283,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { - if (should_resched(0)) { + if (should_resched(0) && !irqs_disabled()) { preempt_schedule_common(); return 1; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5d9143dd08791..6dab4854c6c08 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -9,8 +9,6 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING -DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); - /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -24,14 +22,16 @@ DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); +int sched_clock_irqtime; + void enable_sched_clock_irqtime(void) { - static_branch_enable(&sched_clock_irqtime); + sched_clock_irqtime = 1; } void disable_sched_clock_irqtime(void) { - static_branch_disable(&sched_clock_irqtime); + sched_clock_irqtime = 0; } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 38e4537790af7..7ea9c040c72f0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -3189,7 +3187,7 @@ int sched_dl_global_validate(void) * value smaller than the currently allocated bandwidth in * any of the root_domains. */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); if (dl_bw_visited(cpu, gen)) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5a81d9a1e31f2..0f1da199cfc7c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3117,7 +3117,6 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; - bool prev_on_scx = prev->sched_class == &ext_sched_class; bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; bool kick_idle = false; @@ -3137,14 +3136,18 @@ static struct task_struct *pick_task_scx(struct rq *rq) * if pick_task_scx() is called without preceding balance_scx(). */ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev_on_scx) { + if (prev->scx.flags & SCX_TASK_QUEUED) { keep_prev = true; } else { keep_prev = false; kick_idle = true; } - } else if (unlikely(keep_prev && !prev_on_scx)) { - /* only allowed during transitions */ + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); keep_prev = false; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c0ef435a7aae..ae0350088ac1a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -74,10 +74,10 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -883,6 +883,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } +/* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + /* * Earliest Eligible Virtual Deadline First * @@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). - */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -4045,15 +4061,17 @@ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) { struct cfs_rq *prev_cfs_rq; struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); if (cfs_rq->on_list) { prev = cfs_rq->leaf_cfs_rq_list.prev; } else { - struct rq *rq = rq_of(cfs_rq); - prev = rq->tmp_alone_branch; } + if (prev == &rq->leaf_cfs_rq_list) + return false; + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); return (prev_cfs_rq->tg->parent == cfs_rq->tg); @@ -5528,11 +5546,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. - */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); @@ -6539,14 +6554,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6989,6 +7004,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable += h_nr_runnable; @@ -7118,6 +7135,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable -= h_nr_runnable; @@ -8781,8 +8800,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; @@ -8801,8 +8827,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) - se->vlag = se->deadline + 1; + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* * If @p has become the most eligible task, force preemption. @@ -9415,12 +9441,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4b8e33c615b12..7a9ed4d933977 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -127,9 +127,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b93c8c3dc05a5..023b844159c94 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3259,11 +3259,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); +extern int sched_clock_irqtime; static inline int irqtime_enabled(void) { - return static_branch_likely(&sched_clock_irqtime); + return sched_clock_irqtime; } /* @@ -3698,10 +3698,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { struct cpumask *cidmask = mm_cidmask(mm); struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid = __this_cpu_read(pcpu_cid->recent_cid); + int cid, max_nr_cid, allowed_max_nr_cid; + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } /* Try to re-use recent cid. This improves cache locality. */ - if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) return cid; /* * Expand cid allocation if the maximum number of concurrency @@ -3709,8 +3727,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) * and number of threads. Expanding cid allocation as much as * possible improves cache locality. */ - cid = atomic_read(&mm->max_nr_cid); + cid = max_nr_cid; while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) continue; if (!cpumask_test_and_set_cpu(cid, cidmask)) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 456d339be98fb..9f40348f1dc7a 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -875,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; @@ -984,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, struct sched_attr attr; int retval; - if (!uattr || pid < 0 || flags) + if (unlikely(!uattr || pid < 0 || flags)) return -EINVAL; retval = sched_copy_attr(uattr, &attr); @@ -1049,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) struct task_struct *p; int retval; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; scoped_guard (rcu) { @@ -1085,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, struct task_struct *p; int retval; - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) + if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags)) return -EINVAL; scoped_guard (rcu) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7bbb408431ebc..9b26fc1fbbe85 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -750,13 +750,21 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe +#if defined __NR_uretprobe || defined __NR_uprobe #ifdef SECCOMP_ARCH_COMPAT if (sd->arch == SECCOMP_ARCH_NATIVE) #endif + { +#ifdef __NR_uretprobe if (sd->nr == __NR_uretprobe) return true; #endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1034,6 +1042,9 @@ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, +#endif +#ifdef __NR_uprobe + __NR_uprobe, #endif -1, /* negative terminated */ }; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c6..bf5d05c635ffd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb57da499ebb1..4ebe6136b08d3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include @@ -91,12 +90,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ - -#ifdef CONFIG_PERF_EVENTS -static const int six_hundred_forty_kb = 640 * 1024; -#endif - - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1831,16 +1824,6 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - #if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ defined(CONFIG_DEBUG_STACKOVERFLOW) { @@ -1851,43 +1834,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_MMU) { .procname = "randomize_va_space", @@ -1906,15 +1852,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1932,63 +1869,6 @@ static const struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! - */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_event_max_sample_rate_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_THOUSAND, - }, #endif { .procname = "panic_on_warn", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index deb1aa32814e3..22376a1a75b9f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -1587,19 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; -} - -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) -{ - return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952af..e3642278df433 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); out: mutex_unlock(&offset_lock); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b62..b837d3d9d3253 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c0..7f4e4fb7381ef 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -171,11 +151,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1b675aee99a98..df498745628cc 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -381,7 +381,7 @@ static void posix_timer_unhash_and_free(struct k_itimer *tmr) static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -747,7 +747,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,8 +756,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); @@ -1099,8 +1098,10 @@ void exit_itimers(struct task_struct *tsk) spin_unlock_irq(&tsk->sighand->siglock); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) + while (!hlist_empty(&timers)) { itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + cond_resched(); + } /* * There should be no timers on the ignored list. itimer_delete() has diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1efa..cc15fe293719f 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a15..a88b72b0f35ee 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c1..c527b421c8652 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d3831431658..01c2ab1e89719 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,29 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; + vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; + vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + vc[CS_RAW].mask = tk->tkr_raw.mask; + vc[CS_RAW].mult = tk->tkr_raw.mult; + vc[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,21 +87,21 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; @@ -108,32 +109,31 @@ void update_vsyscall(struct timekeeper *tk) /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } /** @@ -150,7 +150,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 13bef2462e94b..05bedb0f1919d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1039,27 +1039,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2560b312ad576..33082c4e8154e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -403,13 +403,12 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) lockdep_assert_held(&fprobe_mutex); fprobe_graph_active--; - if (!fprobe_graph_active) { - /* Q: should we unregister it ? */ + /* Q: should we unregister it ? */ + if (!fprobe_graph_active) unregister_ftrace_graph(&fprobe_graph_ops); - return; - } - ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); + if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } static int symbols_cmp(const void *a, const void *b) @@ -679,8 +678,7 @@ int unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - if (count) - fprobe_graph_remove_ips(addrs, count); + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 728ecda6e8d4d..fc88e0688daf0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -540,6 +540,7 @@ static int function_stat_show(struct seq_file *m, void *v) static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif guard(mutex)(&ftrace_profile_lock); @@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); @@ -3220,15 +3217,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src) * The filter_hash updates uses just the append_hash() function * and the notrace_hash does not. */ -static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) { struct ftrace_func_entry *entry; int size; int i; - /* An empty hash does everything */ - if (ftrace_hash_empty(*hash)) - return 0; + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } /* If new_hash has everything make hash have everything */ if (ftrace_hash_empty(new_hash)) { @@ -3292,16 +3296,18 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has /* Return a new hash that has a union of all @ops->filter_hash entries */ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) { - struct ftrace_hash *new_hash; + struct ftrace_hash *new_hash = NULL; struct ftrace_ops *subops; + int size_bits; int ret; - new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits); - if (!new_hash) - return NULL; + if (ops->func_hash->filter_hash) + size_bits = ops->func_hash->filter_hash->size_bits; + else + size_bits = FTRACE_HASH_DEFAULT_BITS; list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash); + ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); if (ret < 0) { free_ftrace_hash(new_hash); return NULL; @@ -3310,7 +3316,8 @@ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) if (ftrace_hash_empty(new_hash)) break; } - return new_hash; + /* Can't return NULL as that means this failed */ + return new_hash ? : EMPTY_HASH; } /* Make @ops trace evenything except what all its subops do not trace */ @@ -3505,7 +3512,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); if (!filter_hash) return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash); + ret = append_hash(&filter_hash, subops->func_hash->filter_hash, + size_bits); if (ret < 0) { free_ftrace_hash(filter_hash); return ret; @@ -5707,6 +5715,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4cb275316e51e..513de9ceb80ef 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1591,6 +1591,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos) return iter; #endif + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. + */ + kfree(iter); return NULL; } @@ -1667,9 +1674,9 @@ static int s_show(struct seq_file *m, void *v) } #endif -static void s_stop(struct seq_file *m, void *p) +static void s_stop(struct seq_file *m, void *v) { - kfree(p); + kfree(v); t_stop(m, NULL); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 261163b00137a..ad7419e240556 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6724,27 +6724,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b8f3c4ba309b6..e27305d31fc57 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1049,6 +1049,19 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (*is_return) return 0; + if (is_tracepoint) { + tmp = *symbol; + while (*tmp && (isalnum(*tmp) || *tmp == '_')) + tmp++; + if (*tmp) { + /* find a wrong character. */ + trace_probe_log_err(tmp - *symbol, BAD_TP_NAME); + kfree(*symbol); + *symbol = NULL; + return -EINVAL; + } + } + /* If there is $retval, this should be a return fprobe. */ for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); @@ -1056,6 +1069,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (is_tracepoint) { trace_probe_log_set_index(i); trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + kfree(*symbol); + *symbol = NULL; return -EINVAL; } *is_return = true; @@ -1215,6 +1230,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_return && tf->tp.entry_arg) { tf->fp.entry_handler = trace_fprobe_entry_handler; tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_EARGS); + return -E2BIG; + } } ret = traceprobe_set_print_fmt(&tf->tp, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d358c9935164d..df56f9b760109 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -216,7 +216,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); - trace_ctx = tracing_gen_ctx(); + trace_ctx = tracing_gen_ctx_dec(); data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) @@ -321,7 +321,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; if (unlikely(!tr->function_enabled)) @@ -347,8 +346,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); trace_function(tr, ip, parent_ip, trace_ctx); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index f3a2722ee4c07..15999532956ab 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1901,8 +1901,7 @@ static int timerlat_main(void *data) tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* @@ -2456,8 +2455,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5803e6a415705..96792bc4b0924 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -36,7 +36,6 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX -#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -481,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_TP_NAME, "Invalid character in tracepoint name"),\ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -544,7 +544,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ - C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 8800f5acc0071..2ef2e1b800916 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -133,7 +133,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) - return NULL; + return ERR_PTR(-ENOMEM); init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return NULL; + return ERR_PTR(PTR_ERR(tsk)); } vtsk->task = tsk; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2da7de39d06d..9fa2af9dbf2ce 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -static void __lockup_detector_cleanup(void); - #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, @@ -797,8 +795,7 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); @@ -886,11 +883,6 @@ static void __lockup_detector_reconfigure(void) watchdog_hardlockup_start(); cpus_read_unlock(); - /* - * Must be called outside the cpus locked section to prevent - * recursive locking in the perf code. - */ - __lockup_detector_cleanup(); } void lockup_detector_reconfigure(void) @@ -940,24 +932,6 @@ static inline void lockup_detector_setup(void) } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_cleanup(void) -{ - lockdep_assert_held(&watchdog_mutex); - hardlockup_detector_perf_cleanup(); -} - -/** - * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes - * - * Caller must not hold the cpu hotplug rwsem. - */ -void lockup_detector_cleanup(void) -{ - mutex_lock(&watchdog_mutex); - __lockup_detector_cleanup(); - mutex_unlock(&watchdog_mutex); -} - /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 59c1d86a73a24..a78ff092d6360 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -21,8 +21,6 @@ #include static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); -static struct cpumask dead_events_mask; static atomic_t watchdog_cpus = ATOMIC_INIT(0); @@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void) PTR_ERR(evt)); return PTR_ERR(evt); } + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); this_cpu_write(watchdog_ev, evt); return 0; } @@ -181,36 +180,12 @@ void watchdog_hardlockup_disable(unsigned int cpu) if (event) { perf_event_disable(event); + perf_event_release_kernel(event); this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); - cpumask_set_cpu(smp_processor_id(), &dead_events_mask); atomic_dec(&watchdog_cpus); } } -/** - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them - * - * Called from lockup_detector_cleanup(). Serialized by the caller. - */ -void hardlockup_detector_perf_cleanup(void) -{ - int cpu; - - for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); - - /* - * Required because for_each_cpu() reports unconditionally - * CPU0 as set on UP kernels. Sigh. - */ - if (event) - perf_event_release_kernel(event); - per_cpu(dead_event, cpu) = NULL; - } - cpumask_clear(&dead_events_mask); -} - /** * hardlockup_detector_perf_stop - Globally stop watchdog events * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 97152f2250fe7..bfe030b443e27 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2254,8 +2254,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq)))) + WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", + work->func, wq->name))) { return; + } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1af972a92d06f..35796c290ca35 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2103,7 +2103,7 @@ config FAIL_SKB_REALLOC reallocated, catching possible invalid pointers to the skb. For more information, check - Documentation/dev-tools/fault-injection/fault-injection.rst + Documentation/fault-injection/fault-injection.rst config FAULT_INJECTION_CONFIGFS bool "Configfs interface for fault-injection capabilities" diff --git a/lib/Makefile b/lib/Makefile index d5cfc7afbbb82..fcdee83deb5c6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -132,7 +132,7 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) -obj-y += math/ crypto/ +obj-y += math/ crypto/ vdso/ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 759ea1783cc59..d726068358c7d 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -254,7 +254,7 @@ static __init int test_atomics_init(void) pr_info("passed for %s platform %s CX8 and %s SSE\n", #ifdef CONFIG_X86_64 "x86-64", -#elif defined(CONFIG_X86_CMPXCHG64) +#elif defined(CONFIG_X86_CX8) "i586+", #else "i386+", diff --git a/lib/rcuref.c b/lib/rcuref.c index 97f300eca927c..5bd726b71e393 100644 --- a/lib/rcuref.c +++ b/lib/rcuref.c @@ -220,6 +220,7 @@ EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count + * @cnt: The resulting value of the fastpath decrement * * Invoked when the reference count is outside of the valid zone. * @@ -233,10 +234,8 @@ EXPORT_SYMBOL_GPL(rcuref_get_slowpath); * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ -bool rcuref_put_slowpath(rcuref_t *ref) +bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt) { - unsigned int cnt = atomic_read(&ref->refcnt); - /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* diff --git a/lib/test_objpool.c b/lib/test_objpool.c index 896c0131c9a8c..8f688187fa872 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -190,8 +190,7 @@ static int ot_init_hrtimer(struct ot_item *item, unsigned long hrtimer) return -ENOENT; item->hrtcycle = ktime_set(0, hrtimer * 1000000UL); - hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrt->function = ot_hrtimer_handler; + hrtimer_setup(hrt, ot_hrtimer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 82fe827af5426..45df764b49ad6 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -43,3 +43,8 @@ config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). + +config GENERIC_VDSO_DATA_STORE + bool + help + Selected by architectures that use the generic vDSO data store. diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index cedbf15f80874..aedd40aaa950c 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -1,18 +1,3 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-only -GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) - -c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) -c-getrandom-$(CONFIG_VDSO_GETRANDOM) := $(addprefix $(GENERIC_VDSO_DIR), getrandom.c) - -# This cmd checks that the vdso library does not contain dynamic relocations. -# It has to be called after the linking of the vdso library and requires it -# as a parameter. -# -# As a workaround for some GNU ld ports which produce unneeded R_*_NONE -# dynamic relocations, ignore R_*_NONE. -quiet_cmd_vdso_check = VDSOCHK $@ - cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ - then (echo >&2 "$@: dynamic relocations are not supported"; \ - rm -f $@; /bin/false); fi +obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o diff --git a/lib/vdso/Makefile.include b/lib/vdso/Makefile.include new file mode 100644 index 0000000000000..cedbf15f80874 --- /dev/null +++ b/lib/vdso/Makefile.include @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) + +c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) +c-getrandom-$(CONFIG_VDSO_GETRANDOM) := $(addprefix $(GENERIC_VDSO_DIR), getrandom.c) + +# This cmd checks that the vdso library does not contain dynamic relocations. +# It has to be called after the linking of the vdso library and requires it +# as a parameter. +# +# As a workaround for some GNU ld ports which produce unneeded R_*_NONE +# dynamic relocations, ignore R_*_NONE. +quiet_cmd_vdso_check = VDSOCHK $@ + cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ + then (echo >&2 "$@: dynamic relocations are not supported"; \ + rm -f $@; /bin/false); fi diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c new file mode 100644 index 0000000000000..c715e217ec657 --- /dev/null +++ b/lib/vdso/datastore.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include + +/* + * The vDSO data page. + */ +#ifdef CONFIG_HAVE_GENERIC_VDSO +static union { + struct vdso_time_data data; + u8 page[PAGE_SIZE]; +} vdso_time_data_store __page_aligned_data; +struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; +static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); +#endif /* CONFIG_HAVE_GENERIC_VDSO */ + +#ifdef CONFIG_VDSO_GETRANDOM +static union { + struct vdso_rng_data data; + u8 page[PAGE_SIZE]; +} vdso_rng_data_store __page_aligned_data; +struct vdso_rng_data *vdso_k_rng_data = &vdso_rng_data_store.data; +static_assert(sizeof(vdso_rng_data_store) == PAGE_SIZE); +#endif /* CONFIG_VDSO_GETRANDOM */ + +#ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA +static union { + struct vdso_arch_data data; + u8 page[VDSO_ARCH_DATA_SIZE]; +} vdso_arch_data_store __page_aligned_data; +struct vdso_arch_data *vdso_k_arch_data = &vdso_arch_data_store.data; +#endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */ + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long addr, pfn; + vm_fault_t err; + + switch (vmf->pgoff) { + case VDSO_TIME_PAGE_OFFSET: + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_VDSO)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. + */ + addr = vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } + break; + case VDSO_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!IS_ENABLED(CONFIG_TIME_NS) || !timens_page) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + break; + case VDSO_RNG_PAGE_OFFSET: + if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_rng_data)); + break; + case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END: + if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) + + vmf->pgoff - VDSO_ARCH_PAGES_START; + break; + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); +} + +const struct vm_special_mapping vdso_vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + +struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr) +{ + return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, + VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP, + &vdso_vvar_mapping); +} + +#ifdef CONFIG_TIME_NS +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma_pages(vma); + } + mmap_read_unlock(mm); + + return 0; +} +#endif diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c index 938ca539aaa64..440f8a6203a69 100644 --- a/lib/vdso/getrandom.c +++ b/lib/vdso/getrandom.c @@ -12,6 +12,9 @@ #include #include +/* Bring in default accessors */ +#include + #undef PAGE_SIZE #undef PAGE_MASK #define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT) @@ -152,7 +155,7 @@ __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_ /* * Prevent the syscall from being reordered wrt current_generation. Pairs with the - * smp_store_release(&_vdso_rng_data.generation) in random.c. + * smp_store_release(&vdso_k_rng_data->generation) in random.c. */ smp_rmb(); @@ -256,5 +259,6 @@ __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_ static __always_inline ssize_t __cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) { - return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len); + return __cvdso_getrandom_data(__arch_get_vdso_u_rng_data(), buffer, len, flags, + opaque_state, opaque_len); } diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index c01eaafd80419..93ef801a97ef2 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -5,6 +5,9 @@ #include #include +/* Bring in default accessors */ +#include + #ifndef vdso_calc_ns #ifdef VDSO_DELTA_NOMASK @@ -14,12 +17,12 @@ #endif #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT -static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 delta) +static __always_inline bool vdso_delta_ok(const struct vdso_clock *vc, u64 delta) { - return delta < vd->max_cycles; + return delta < vc->max_cycles; } #else -static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 delta) +static __always_inline bool vdso_delta_ok(const struct vdso_clock *vc, u64 delta) { return true; } @@ -36,14 +39,14 @@ static __always_inline u64 vdso_shift_ns(u64 ns, u32 shift) * Default implementation which works for all sane clocksources. That * obviously excludes x86/TSC. */ -static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base) +static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base) { - u64 delta = (cycles - vd->cycle_last) & VDSO_DELTA_MASK(vd); + u64 delta = (cycles - vc->cycle_last) & VDSO_DELTA_MASK(vc); - if (likely(vdso_delta_ok(vd, delta))) - return vdso_shift_ns((delta * vd->mult) + base, vd->shift); + if (likely(vdso_delta_ok(vc, delta))) + return vdso_shift_ns((delta * vc->mult) + base, vc->shift); - return mul_u64_u32_add_u64_shr(delta, vd->mult, base, vd->shift); + return mul_u64_u32_add_u64_shr(delta, vc->mult, base, vc->shift); } #endif /* vdso_calc_ns */ @@ -55,9 +58,9 @@ static inline bool __arch_vdso_hres_capable(void) #endif #ifndef vdso_clocksource_ok -static inline bool vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) { - return vd->clock_mode != VDSO_CLOCKMODE_NONE; + return vc->clock_mode != VDSO_CLOCKMODE_NONE; } #endif @@ -69,36 +72,45 @@ static inline bool vdso_cycles_ok(u64 cycles) #endif #ifdef CONFIG_TIME_NS -static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) + +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE +static __always_inline +const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) +{ + return (void *)vd + PAGE_SIZE; +} +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ + +static __always_inline +int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); + const struct timens_offset *offs = &vcns->offset[clk]; + const struct vdso_clock *vc = vd->clock_data; const struct vdso_timestamp *vdso_ts; - const struct vdso_data *vd; u64 cycles, ns; u32 seq; s64 sec; - vd = vdns - (clk == CLOCK_MONOTONIC_RAW ? CS_RAW : CS_HRES_COARSE); - vd = __arch_get_timens_vdso_data(vd); if (clk != CLOCK_MONOTONIC_RAW) - vd = &vd[CS_HRES_COARSE]; + vc = &vc[CS_HRES_COARSE]; else - vd = &vd[CS_RAW]; - vdso_ts = &vd->basetime[clk]; + vc = &vc[CS_RAW]; + vdso_ts = &vc->basetime[clk]; do { - seq = vdso_read_begin(vd); + seq = vdso_read_begin(vc); - if (unlikely(!vdso_clocksource_ok(vd))) + if (unlikely(!vdso_clocksource_ok(vc))) return -1; - cycles = __arch_get_hw_counter(vd->clock_mode, vd); + cycles = __arch_get_hw_counter(vc->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) return -1; - ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec); + ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); sec = vdso_ts->sec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); /* Add the namespace offset */ sec += offs->sec; @@ -115,22 +127,24 @@ static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_ } #else static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) { return NULL; } -static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { return -EINVAL; } #endif -static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u64 cycles, sec, ns; u32 seq; @@ -142,31 +156,31 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, /* * Open coded function vdso_read_begin() to handle * VDSO_CLOCKMODE_TIMENS. Time namespace enabled tasks have a - * special VVAR page installed which has vd->seq set to 1 and - * vd->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time + * special VVAR page installed which has vc->seq set to 1 and + * vc->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time * namespace affected tasks this does not affect performance - * because if vd->seq is odd, i.e. a concurrent update is in - * progress the extra check for vd->clock_mode is just a few - * extra instructions while spin waiting for vd->seq to become + * because if vc->seq is odd, i.e. a concurrent update is in + * progress the extra check for vc->clock_mode is just a few + * extra instructions while spin waiting for vc->seq to become * even again. */ - while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) { + while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) { if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - return do_hres_timens(vd, clk, ts); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return do_hres_timens(vd, vc, clk, ts); cpu_relax(); } smp_rmb(); - if (unlikely(!vdso_clocksource_ok(vd))) + if (unlikely(!vdso_clocksource_ok(vc))) return -1; - cycles = __arch_get_hw_counter(vd->clock_mode, vd); + cycles = __arch_get_hw_counter(vc->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) return -1; - ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec); + ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); sec = vdso_ts->sec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); /* * Do this outside the loop: a race inside the loop could result @@ -179,21 +193,25 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, } #ifdef CONFIG_TIME_NS -static __always_inline int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_data *vd = __arch_get_timens_vdso_data(vdns); - const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; - const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); + const struct timens_offset *offs = &vcns->offset[clk]; + const struct vdso_clock *vc = vd->clock_data; + const struct vdso_timestamp *vdso_ts; u64 nsec; s64 sec; s32 seq; + vdso_ts = &vc->basetime[clk]; + do { - seq = vdso_read_begin(vd); + seq = vdso_read_begin(vc); sec = vdso_ts->sec; nsec = vdso_ts->nsec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); /* Add the namespace offset */ sec += offs->sec; @@ -208,17 +226,19 @@ static __always_inline int do_coarse_timens(const struct vdso_data *vdns, clocki return 0; } #else -static __always_inline int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { return -1; } #endif -static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u32 seq; do { @@ -226,25 +246,26 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, * Open coded function vdso_read_begin() to handle * VDSO_CLOCK_TIMENS. See comment in do_hres(). */ - while ((seq = READ_ONCE(vd->seq)) & 1) { + while ((seq = READ_ONCE(vc->seq)) & 1) { if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - return do_coarse_timens(vd, clk, ts); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return do_coarse_timens(vd, vc, clk, ts); cpu_relax(); } smp_rmb(); ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); return 0; } static __always_inline int -__cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { + const struct vdso_clock *vc = vd->clock_data; u32 msk; /* Check for negative values or invalid clocks */ @@ -257,19 +278,19 @@ __cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock, */ msk = 1U << clock; if (likely(msk & VDSO_HRES)) - vd = &vd[CS_HRES_COARSE]; + vc = &vc[CS_HRES_COARSE]; else if (msk & VDSO_COARSE) - return do_coarse(&vd[CS_HRES_COARSE], clock, ts); + return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts); else if (msk & VDSO_RAW) - vd = &vd[CS_RAW]; + vc = &vc[CS_RAW]; else return -1; - return do_hres(vd, clock, ts); + return do_hres(vd, vc, clock, ts); } static __maybe_unused int -__cvdso_clock_gettime_data(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_gettime_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { int ret = __cvdso_clock_gettime_common(vd, clock, ts); @@ -282,12 +303,12 @@ __cvdso_clock_gettime_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { - return __cvdso_clock_gettime_data(__arch_get_vdso_data(), clock, ts); + return __cvdso_clock_gettime_data(__arch_get_vdso_u_time_data(), clock, ts); } #ifdef BUILD_VDSO32 static __maybe_unused int -__cvdso_clock_gettime32_data(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_gettime32_data(const struct vdso_time_data *vd, clockid_t clock, struct old_timespec32 *res) { struct __kernel_timespec ts; @@ -308,19 +329,20 @@ __cvdso_clock_gettime32_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) { - return __cvdso_clock_gettime32_data(__arch_get_vdso_data(), clock, res); + return __cvdso_clock_gettime32_data(__arch_get_vdso_u_time_data(), clock, res); } #endif /* BUILD_VDSO32 */ static __maybe_unused int -__cvdso_gettimeofday_data(const struct vdso_data *vd, +__cvdso_gettimeofday_data(const struct vdso_time_data *vd, struct __kernel_old_timeval *tv, struct timezone *tz) { + const struct vdso_clock *vc = vd->clock_data; if (likely(tv != NULL)) { struct __kernel_timespec ts; - if (do_hres(&vd[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) + if (do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) return gettimeofday_fallback(tv, tz); tv->tv_sec = ts.tv_sec; @@ -329,8 +351,8 @@ __cvdso_gettimeofday_data(const struct vdso_data *vd, if (unlikely(tz != NULL)) { if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - vd = __arch_get_timens_vdso_data(vd); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + vd = __arch_get_vdso_u_timens_data(vd); tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; @@ -342,20 +364,23 @@ __cvdso_gettimeofday_data(const struct vdso_data *vd, static __maybe_unused int __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { - return __cvdso_gettimeofday_data(__arch_get_vdso_data(), tv, tz); + return __cvdso_gettimeofday_data(__arch_get_vdso_u_time_data(), tv, tz); } #ifdef VDSO_HAS_TIME static __maybe_unused __kernel_old_time_t -__cvdso_time_data(const struct vdso_data *vd, __kernel_old_time_t *time) +__cvdso_time_data(const struct vdso_time_data *vd, __kernel_old_time_t *time) { + const struct vdso_clock *vc = vd->clock_data; __kernel_old_time_t t; if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - vd = __arch_get_timens_vdso_data(vd); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = vd->clock_data; + } - t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); + t = READ_ONCE(vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); if (time) *time = t; @@ -365,15 +390,16 @@ __cvdso_time_data(const struct vdso_data *vd, __kernel_old_time_t *time) static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) { - return __cvdso_time_data(__arch_get_vdso_data(), time); + return __cvdso_time_data(__arch_get_vdso_u_time_data(), time); } #endif /* VDSO_HAS_TIME */ #ifdef VDSO_HAS_CLOCK_GETRES static __maybe_unused -int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, +int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *res) { + const struct vdso_clock *vc = vd->clock_data; u32 msk; u64 ns; @@ -382,8 +408,8 @@ int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, return -1; if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - vd = __arch_get_timens_vdso_data(vd); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + vd = __arch_get_vdso_u_timens_data(vd); /* * Convert the clockid to a bitmask and use it to check which @@ -394,7 +420,7 @@ int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, /* * Preserves the behaviour of posix_get_hrtimer_res(). */ - ns = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); + ns = READ_ONCE(vd->hrtimer_res); } else if (msk & VDSO_COARSE) { /* * Preserves the behaviour of posix_get_coarse_res(). @@ -412,7 +438,7 @@ int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, } static __maybe_unused -int __cvdso_clock_getres_data(const struct vdso_data *vd, clockid_t clock, +int __cvdso_clock_getres_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *res) { int ret = __cvdso_clock_getres_common(vd, clock, res); @@ -425,12 +451,12 @@ int __cvdso_clock_getres_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) { - return __cvdso_clock_getres_data(__arch_get_vdso_data(), clock, res); + return __cvdso_clock_getres_data(__arch_get_vdso_u_time_data(), clock, res); } #ifdef BUILD_VDSO32 static __maybe_unused int -__cvdso_clock_getres_time32_data(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_getres_time32_data(const struct vdso_time_data *vd, clockid_t clock, struct old_timespec32 *res) { struct __kernel_timespec ts; @@ -451,7 +477,7 @@ __cvdso_clock_getres_time32_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) { - return __cvdso_clock_getres_time32_data(__arch_get_vdso_data(), + return __cvdso_clock_getres_time32_data(__arch_get_vdso_u_time_data(), clock, res); } #endif /* BUILD_VDSO32 */ diff --git a/mm/compaction.c b/mm/compaction.c index 12ed8425fa175..a3203d97123ea 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3181,6 +3181,7 @@ static int kcompactd(void *p) long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); long timeout = default_timeout; + current->flags |= PF_KCOMPACTD; set_freezable(); pgdat->kcompactd_max_order = 0; @@ -3237,6 +3238,8 @@ static int kcompactd(void *p) pgdat->proactive_compact_trigger = false; } + current->flags &= ~PF_KCOMPACTD; + return 0; } diff --git a/mm/execmem.c b/mm/execmem.c index 317b6a8d35be0..e6c4f5076ca8d 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -257,7 +257,6 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; - unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - start = (unsigned long)p; - end = start + alloc_size; - - vunmap_range(start, end); - - err = execmem_set_direct_map_valid(vm, false); - if (err) - goto err_free_mem; - - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, - PMD_SHIFT); + err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) - goto err_free_mem; + goto err_reset_direct_map; return 0; +err_reset_direct_map: + execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; @@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr) return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { diff --git a/mm/filemap.c b/mm/filemap.c index 804d7365680c1..2974691fdfad2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -445,7 +445,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space * @start: index to start writeback on - * @end: last (non-inclusive) index for writeback + * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. @@ -2897,8 +2897,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2955,7 +2954,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3015,7 +3014,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 163190e89ea16..97930d44d460e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2943,6 +2943,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) return ret; } +void wait_for_freed_hugetlb_folios(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv @@ -5447,7 +5455,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); @@ -5622,7 +5630,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); diff --git a/mm/internal.h b/mm/internal.h index 10a8b4b3b86ec..6ca8036ab896d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1115,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -1138,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else -static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { + return -EBUSY; } #endif diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3ea50f09311fd..3df45c25c1f62 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, size -= to_go; } } +EXPORT_SYMBOL_GPL(kmsan_handle_dma); void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 995a15eb67e2c..327e02fdc029d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - struct address_space *mapping; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; + struct address_space *mapping; + + if (folio_test_swapcache(folio)) { + pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); + ttu &= ~TTU_HWPOISON; + } + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). + */ + mapping = folio_mapping(folio); + if (!must_kill && !folio_test_dirty(folio) && mapping && + mapping_can_writeback(mapping)) { + if (folio_mkclean(folio)) { + folio_set_dirty(folio); + } else { + ttu &= ~TTU_HWPOISON; + pr_info("%#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb folios in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) if (!mapping) { pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", folio_pfn(folio)); - return; + return -EBUSY; } try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); @@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) } else { try_to_unmap(folio, ttu); } + + return folio_mapped(folio) ? -EBUSY : 0; } /* @@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) static bool hwpoison_user_mappings(struct folio *folio, struct page *p, unsigned long pfn, int flags) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; - struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; @@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, if (!folio_mapped(folio)) return true; - if (folio_test_swapcache(folio)) { - pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); - ttu &= ~TTU_HWPOISON; - } - - /* - * Propagate the dirty bit from PTEs to struct page first, because we - * need this to decide if we should kill or just drop the page. - * XXX: the dirty test could be racy: set_page_dirty() may not always - * be called inside page lock (it's recommended but not enforced). - */ - mapping = folio_mapping(folio); - if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && - mapping_can_writeback(mapping)) { - if (folio_mkclean(folio)) { - folio_set_dirty(folio); - } else { - ttu &= ~TTU_HWPOISON; - pr_info("%#lx: corrupted page was clean: dropped without side effects\n", - pfn); - } - } - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, @@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - unmap_poisoned_folio(folio, ttu); - - unmap_success = !folio_mapped(folio); + unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL); if (!unmap_success) pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", pfn, folio_mapcount(folio)); diff --git a/mm/memory.c b/mm/memory.c index 3b5d0ebd2a02d..c157d1e1e3b18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1379,11 +1379,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); + ret = track_pfn_copy(dst_vma, src_vma); if (ret) return ret; } @@ -1420,7 +1416,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1430,6 +1425,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } + if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) + untrack_pfn_copy(dst_vma, src_vma); return ret; } @@ -3051,8 +3048,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return -EINVAL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) { + err = -EINVAL; + break; + } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; @@ -5183,7 +5182,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; - unsigned long addr = vmf->address; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -5222,7 +5225,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -5258,9 +5262,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); - ret = VM_FAULT_NOPAGE; - goto unlock; + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } folio_ref_add(folio, nr_pages - 1); @@ -6964,10 +6968,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3655f07dd6e3..16cf9e17077e3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1822,26 +1822,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (folio_test_large(folio)) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - /* - * HWPoison pages have elevated reference counts so the migration would - * fail on them. It also doesn't make any sense to migrate them in the - * first place. Still try to unmap such a page in case it is still mapped - * (keep the unmap as the catch all safety net). - */ + if (!folio_try_get(folio)) + continue; + + if (unlikely(page_folio(page) != folio)) + goto put_folio; + if (folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); - if (folio_mapped(folio)) - unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); - continue; - } - - if (!folio_try_get(folio)) - continue; + if (folio_mapped(folio)) { + folio_lock(folio); + unmap_poisoned_folio(folio, pfn, false); + folio_unlock(folio); + } - if (unlikely(page_folio(page) != folio)) goto put_folio; + } if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0904315d4da2..6c57fc1cddd3b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4304,6 +4304,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: compaction_retries = 0; no_progress_loops = 0; + compact_result = COMPACT_SKIPPED; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); zonelist_iter_cookie = zonelist_iter_begin(); @@ -5931,11 +5932,10 @@ static void setup_per_zone_lowmem_reserve(void) for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; - bool empty = !zone_managed_pages(upper_zone); managed_pages += zone_managed_pages(upper_zone); - if (clear || empty) + if (clear) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c608e9d728655..a051a29e95ad0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -607,6 +607,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone; int ret; + /* + * Due to the deferred freeing of hugetlb folios, the hugepage folios may + * not immediately release to the buddy system. This can cause PageBuddy() + * to fail in __test_page_isolated_in_pageblock(). To ensure that the + * hugetlb folios are properly released back to the buddy system, we + * invoke the wait_for_freed_hugetlb_folios() function to wait for the + * release to complete. + */ + wait_for_freed_hugetlb_folios(); + /* * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free * pages are not aligned to pageblock_nr_pages. diff --git a/mm/percpu.c b/mm/percpu.c index ac61e3fc5f151..7b5835356d1e1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3071,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3240,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ diff --git a/mm/shmem.c b/mm/shmem.c index 4ea6109a80431..1ede0800e8461 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1548,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (WARN_ON_ONCE(!wbc->for_reclaim)) goto redirty; - if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) + if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) @@ -2253,7 +2253,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio *folio = NULL; bool skip_swapcache = false; swp_entry_t swap; - int error, nr_pages; + int error, nr_pages, order, split_order; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -2272,10 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); + order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int order = xa_get_order(&mapping->i_pages, index); bool fallback_order0 = false; - int split_order; /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -2339,6 +2338,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } + } else if (order != folio_order(folio)) { + /* + * Swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } } alloced: @@ -2346,7 +2368,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + !shmem_confirm_swap(mapping, index, swap) || + xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { error = -EEXIST; goto unlock; } @@ -3487,7 +3510,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -3514,7 +3537,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3601,7 +3624,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); diff --git a/mm/slab_common.c b/mm/slab_common.c index 4030907b6b7d8..c55106169e7be 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1304,6 +1304,8 @@ module_param(rcu_min_cached_objs, int, 0444); static int rcu_delay_page_cache_fill_msec = 5000; module_param(rcu_delay_page_cache_fill_msec, int, 0444); +static struct workqueue_struct *rcu_reclaim_wq; + /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (5 * HZ) #define KFREE_N_BATCHES 2 @@ -1632,10 +1634,10 @@ __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) - mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); + mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); return; } - queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); + queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); } static void @@ -1733,7 +1735,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) // "free channels", the batch can handle. Break // the loop since it is done with this CPU thus // queuing an RCU work is _always_ success here. - queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work); WARN_ON_ONCE(!queued); break; } @@ -1883,12 +1885,12 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_unbound_wq, + queue_delayed_work(rcu_reclaim_wq, &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); } } @@ -2120,6 +2122,10 @@ void __init kvfree_rcu_init(void) int i, j; struct shrinker *kfree_rcu_shrinker; + rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_reclaim_wq); + /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { diff --git a/mm/swapfile.c b/mm/swapfile.c index ba19430dd4ead..df7c4e8b089ca 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -653,7 +653,8 @@ static void relocate_cluster(struct swap_info_struct *si, return; if (!ci->count) { - free_cluster(si, ci); + if (ci->flags != CLUSTER_FLAG_FREE) + free_cluster(si, ci); } else if (ci->count != SWAPFILE_CLUSTER) { if (ci->flags != CLUSTER_FLAG_FRAG) move_cluster(si, ci, &si->frag_clusters[ci->order], @@ -858,6 +859,10 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) offset++; } + /* in case no swap cache is reclaimed */ + if (ci->flags == CLUSTER_FLAG_NONE) + relocate_cluster(si, ci); + unlock_cluster(ci); if (to_scan <= 0) break; @@ -2641,7 +2646,6 @@ static void wait_for_allocation(struct swap_info_struct *si) for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { ci = lock_cluster(si, offset); unlock_cluster(ci); - offset += SWAPFILE_CLUSTER; } } @@ -3542,6 +3546,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) int err, i; si = swp_swap_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); diff --git a/mm/truncate.c b/mm/truncate.c index e2e115adfbc58..76d8fcd89bd00 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -548,8 +548,6 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (folio_test_dirty(folio)) - return 0; if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af3dfc3633dbe..d06453fa8abae 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,7 @@ #include #include #include "internal.h" +#include "swap.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) @@ -1076,16 +1077,14 @@ static int move_present_pte(struct mm_struct *mm, return err; } -static int move_swap_pte(struct mm_struct *mm, +static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, - spinlock_t *dst_ptl, spinlock_t *src_ptl) + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio) { - if (!pte_swp_exclusive(orig_src_pte)) - return -EBUSY; - double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1094,6 +1093,16 @@ static int move_swap_pte(struct mm_struct *mm, return -EAGAIN; } + /* + * The src_folio resides in the swapcache, requiring an update to its + * index and mapping to align with the dst_vma, where a swap-in may + * occur and hit the swapcache after moving the PTE. + */ + if (src_folio) { + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + } + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1141,6 +1150,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, __u64 mode) { swp_entry_t entry; + struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; @@ -1240,6 +1250,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, */ if (!src_folio) { struct folio *folio; + bool locked; /* * Pin the page while holding the lock to be sure the @@ -1259,14 +1270,28 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, goto out; } + locked = folio_trylock(folio); + /* + * We avoid waiting for folio lock with a raised + * refcount for large folios because extra refcounts + * will result in split_folio() failing later and + * retrying. If multiple tasks are trying to move a + * large folio we can end up livelocking. + */ + if (!locked && folio_test_large(folio)) { + spin_unlock(src_ptl); + err = -EAGAIN; + goto out; + } + folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); - if (!folio_trylock(src_folio)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + if (!locked) { + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); @@ -1282,8 +1307,8 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; err = split_folio(src_folio); if (err) @@ -1308,8 +1333,8 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ anon_vma_lock_write(src_anon_vma); @@ -1322,11 +1347,13 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio); } else { + struct folio *folio = NULL; + entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { if (is_migration_entry(entry)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); err = -EAGAIN; @@ -1335,9 +1362,53 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, dst_pmd, - dst_pmdval, dst_ptl, src_ptl); + if (!pte_swp_exclusive(orig_src_pte)) { + err = -EBUSY; + goto out; + } + + si = get_swap_device(entry); + if (unlikely(!si)) { + err = -EAGAIN; + goto out; + } + /* + * Verify the existence of the swapcache. If present, the folio's + * index and mapping must be updated even when the PTE is a swap + * entry. The anon_vma lock is not taken during this process since + * the folio has already been unmapped, and the swap entry is + * exclusive, preventing rmap walks. + * + * For large folios, return -EBUSY immediately, as split_folio() + * also returns -EBUSY when attempting to split unmapped large + * folios in the swapcache. This issue needs to be resolved + * separately to allow proper handling. + */ + if (!src_folio) + folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_large(folio)) { + err = -EBUSY; + folio_put(folio); + goto out; + } + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + } + err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, + dst_ptl, src_ptl, src_folio); } out: @@ -1354,6 +1425,8 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, if (src_pte) pte_unmap(src_pte); mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); return err; } diff --git a/mm/vma.c b/mm/vma.c index af1d549b179c9..96bcb372c90e4 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1509,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; + unsigned long start = vmg->start; + unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; + if (vmg_nomem(vmg)) + return ERR_PTR(-ENOMEM); /* Split any preceding portion of the VMA. */ - if (vma->vm_start < vmg->start) { - int err = split_vma(vmg->vmi, vma, vmg->start, 1); + if (vma->vm_start < start) { + int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ - if (vma->vm_end > vmg->end) { - int err = split_vma(vmg->vmi, vma, vmg->end, 0); + if (vma->vm_end > end) { + int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a6e7acebe9adf..61981ee1c9d2f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) - return err; + break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); - return 0; + return err; } /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 16bfe1c694dd4..88998725f1c5a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1435,6 +1435,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", + "direct_map_level2_collapses", + "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS "vma_lock_success", diff --git a/mm/zswap.c b/mm/zswap.c index ac9d299e7d0c1..23365e76a3ce3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index e45187b882206..41be38264493d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -131,7 +131,8 @@ int vlan_check_real_dev(struct net_device *real_dev, { const char *name = real_dev->name; - if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { + if (real_dev->features & NETIF_F_VLAN_CHALLENGED || + real_dev->type != ARPHRD_ETHER) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fec11e576f310..b22078b679726 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -632,7 +632,8 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_hold(conn->hcon); - list_add(&chan->list, &conn->chan_l); + /* Append to the list since the order matters for ECRED */ + list_add_tail(&chan->list, &conn->chan_l); } void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) @@ -3771,7 +3772,11 @@ static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data) struct l2cap_ecred_conn_rsp *rsp_flex = container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr); - if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) + /* Check if channel for outgoing connection or if it wasn't deferred + * since in those cases it must be skipped. + */ + if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags) || + !test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags)) return; /* Reset ident so only one response is sent */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f53304cb09dbe..621c555f639be 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9660,6 +9660,9 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0) + eir_precalc_len(sizeof(conn->dev_class))); + if (!skb) + return; + ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, &conn->dst); ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type); @@ -10413,6 +10416,8 @@ void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0)); + if (!skb) + return; ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, bdaddr); diff --git a/net/can/bcm.c b/net/can/bcm.c index 217049fa496e9..526cb6cd901fb 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1011,13 +1011,12 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->timer.function = bcm_tx_timeout_handler; + hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* currently unused in tx_ops */ - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); @@ -1192,13 +1191,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->timer.function = bcm_rx_timeout_handler; - - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->thrtimer.function = bcm_rx_thr_handler; + hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); diff --git a/net/can/isotp.c b/net/can/isotp.c index 16046931542a5..442c343afe1f7 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1634,12 +1634,10 @@ static int isotp_init(struct sock *sk) so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); - hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->rxtimer.function = isotp_rx_timer_handler; - hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->txtimer.function = isotp_tx_timer_handler; - hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->txfrtimer.function = isotp_txfr_timer_handler; + hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 4866879016021..39844f14eed86 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -158,8 +158,8 @@ struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) ecu->addr = J1939_IDLE_ADDR; ecu->name = name; - hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - ecu->ac_timer.function = j1939_ecu_timer_handler; + hrtimer_setup(&ecu->ac_timer, j1939_ecu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); INIT_LIST_HEAD(&ecu->list); j1939_priv_get(priv); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 9b72d118d756d..fbf5c8001c9df 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1511,12 +1511,8 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv, skcb = j1939_skb_to_cb(skb); memcpy(&session->skcb, skcb, sizeof(session->skcb)); - hrtimer_init(&session->txtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - session->txtimer.function = j1939_tp_txtimer; - hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - session->rxtimer.function = j1939_tp_rxtimer; + hrtimer_setup(&session->txtimer, j1939_tp_txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&session->rxtimer, j1939_tp_rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n", __func__, session, skcb->addr.sa, skcb->addr.da); diff --git a/net/core/dev.c b/net/core/dev.c index 1b252e9459fdb..03a7f867c7b34 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2141,21 +2141,15 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - /* rtnl_net_lock() assumes dev is not yet published by - * register_netdevice(). - */ - DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list)); - - rtnl_net_lock(net); - err = __register_netdevice_notifier_net(net, nb, false); + rtnl_net_dev_lock(dev); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); return err; } @@ -4763,7 +4757,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS @@ -7019,8 +7013,7 @@ void netif_napi_add_weight_locked(struct net_device *dev, INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); diff --git a/net/core/gro.c b/net/core/gro.c index 78b320b631744..0ad549b07e039 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -653,6 +653,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { diff --git a/net/core/scm.c b/net/core/scm.c index 4f6a14babe5ae..733c0cbd393d2 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -282,6 +282,16 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7b03b64fdcb27..b1c81687e9d82 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6033,11 +6033,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif + ipvs_reset(skb); if (!xnet) return; - ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } diff --git a/net/core/sock.c b/net/core/sock.c index eae2ae70a2e03..6c0e87f97fa4a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2246,6 +2246,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { + net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } @@ -2270,6 +2271,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) @@ -2301,14 +2303,28 @@ static void __sk_destruct(struct rcu_head *head) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - if (likely(sk->sk_net_refcnt)) - put_net_track(sock_net(sk), &sk->ns_tracker); - else - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); @@ -2405,6 +2421,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * is not properly dismantling its kernel sockets at netns * destroy time. */ + net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ad2741f1346af..c7769ee0d9c55 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -587,7 +588,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index f22051f33868a..84096f6b0236e 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -72,8 +72,8 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; rtnl_lock(); - phydev = ethnl_req_get_phydev(&req_info, - tb[ETHTOOL_A_CABLE_TEST_HEADER], + phydev = ethnl_req_get_phydev(&req_info, tb, + ETHTOOL_A_CABLE_TEST_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; @@ -339,8 +339,8 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; rtnl_lock(); - phydev = ethnl_req_get_phydev(&req_info, - tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + phydev = ethnl_req_get_phydev(&req_info, tb, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index d88e9080643b8..b97374b508f67 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "netlink.h" #include "common.h" @@ -771,6 +772,21 @@ int ethtool_check_ops(const struct ethtool_ops *ops) return 0; } +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack) +{ + memset(param, 0, sizeof(*param)); + memset(kparam, 0, sizeof(*kparam)); + + param->cmd = ETHTOOL_GRINGPARAM; + dev->ethtool_ops->get_ringparam(dev, param, kparam, extack); + + /* Driver gives us current state, we want to return current config */ + kparam->tcp_data_split = dev->cfg->hds_config; +} + static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info) { memset(info, 0, sizeof(*info)); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 58e9e7db06f90..a1088c2441d0a 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -51,6 +51,12 @@ int ethtool_check_max_channel(struct net_device *dev, struct ethtool_channels channels, struct genl_info *info); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); + +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7609ce2b2c5e2..1c3ba2247776b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2059,8 +2059,8 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; + struct ethtool_ringparam ringparam, max; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) @@ -2069,7 +2069,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; - dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); + ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index af19e1bed303f..05a5f72c99fab 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -103,7 +103,7 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_LINKSTATE_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER, info->extack); if (IS_ERR(phydev)) { ret = PTR_ERR(phydev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index b4c45207fa32e..734849a573691 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -211,7 +211,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, } struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, - const struct nlattr *header, + struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack) { struct phy_device *phydev; @@ -225,8 +225,8 @@ struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, return req_info->dev->phydev; phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); - if (!phydev) { - NL_SET_ERR_MSG_ATTR(extack, header, + if (!phydev && tb) { + NL_SET_ERR_MSG_ATTR(extack, tb[header], "no phy matching phyindex"); return ERR_PTR(-ENODEV); } diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index ff69ca0715dea..ec6ab5443a6f2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -275,7 +275,8 @@ static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) * ethnl_req_get_phydev() - Gets the phy_device targeted by this request, * if any. Must be called under rntl_lock(). * @req_info: The ethnl request to get the phy from. - * @header: The netlink header, used for error reporting. + * @tb: The netlink attributes array, for error reporting. + * @header: The netlink header index, used for error reporting. * @extack: The netlink extended ACK, for error reporting. * * The caller must hold RTNL, until it's done interacting with the returned @@ -289,7 +290,7 @@ static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) * is returned. */ struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, - const struct nlattr *header, + struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack); /** diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index ed8f690f6bac8..e067cc234419d 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -125,7 +125,7 @@ static int ethnl_phy_parse_request(struct ethnl_req_info *req_base, struct phy_req_info *req_info = PHY_REQINFO(req_base); struct phy_device *phydev; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PHY_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PHY_HEADER, extack); if (!phydev) return 0; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index d95d92f173a6d..e1f7820a6158f 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -62,7 +62,7 @@ static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { @@ -152,7 +152,7 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) bool mod = false; int ret; - phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PLCA_HEADER], + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) @@ -211,7 +211,7 @@ static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 2819e2ba6be2d..4f6b99eab2a6c 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -64,7 +64,7 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PSE_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER, info->extack); if (IS_ERR(phydev)) return -ENODEV; @@ -261,7 +261,7 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PSE_HEADER], + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PSE_HEADER, info->extack); ret = ethnl_set_pse_validate(phydev, info); if (ret) diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 7839bfd1ac6a0..aeedd5ec6b8cd 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -215,17 +215,16 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { - struct kernel_ethtool_ringparam kernel_ringparam = {}; - struct ethtool_ringparam ringparam = {}; + struct kernel_ethtool_ringparam kernel_ringparam; struct net_device *dev = req_info->dev; + struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; - dev->ethtool_ops->get_ringparam(dev, &ringparam, - &kernel_ringparam, info->extack); - kernel_ringparam.tcp_data_split = dev->cfg->hds_config; + ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, + info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index 038a2558f0520..3ca8eb2a3b314 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -138,7 +138,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_STATS_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STATS_HEADER, info->extack); if (IS_ERR(phydev)) return PTR_ERR(phydev); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 6b76c05caba4d..f6a67109beda1 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -309,7 +309,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, return 0; } - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_HEADER_FLAGS], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS, info->extack); /* phydev can be NULL, check for errors only */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 285678d8ce077..57df7c1d2faac 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2457,14 +2457,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; - err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, - sizeof(dmabuf_cmsg), &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } sent += copy; @@ -2518,16 +2516,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, offset += copy; remaining_len -= copy; - err = put_cmsg(msg, SOL_SOCKET, - SO_DEVMEM_DMABUF, - sizeof(dmabuf_cmsg), - &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } atomic_long_inc(&niov->pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b089b08e96178..dfdb7a4608a85 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -815,12 +815,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ - /* TODO: We probably should defer ts_recent change once - * we take ownership of @req. - */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ @@ -869,6 +863,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && tmp_opt.saw_tstamp && + !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) + tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2308665b51c53..2dfac79dc78b8 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -13,12 +13,15 @@ #include #include -static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, +static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb, unsigned int seq, unsigned int mss) { + u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP; + u32 ts_seq = skb_shinfo(gso_skb)->tskey; + while (skb) { if (before(ts_seq, seq + mss)) { - skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP; + skb_shinfo(skb)->tx_flags |= flags; skb_shinfo(skb)->tskey = ts_seq; return; } @@ -193,8 +196,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); - if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP)) - tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss); + if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP)) + tcp_gso_tstamp(segs, gso_skb, seq, mss); newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta)); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b412ed88ccd9a..e7a75afa995d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -884,11 +884,9 @@ void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); - hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); - tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); - hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED_SOFT); - tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; + hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a5be6e4ed326f..ecfca59f31f13 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -321,13 +321,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, /* clear destructor to avoid skb_segment assigning it to tail */ copy_dtor = gso_skb->destructor == sock_wfree; - if (copy_dtor) + if (copy_dtor) { gso_skb->destructor = NULL; + gso_skb->sk = NULL; + } segs = skb_segment(gso_skb, features); if (IS_ERR_OR_NULL(segs)) { - if (copy_dtor) + if (copy_dtor) { gso_skb->destructor = sock_wfree; + gso_skb->sk = sk; + } return segs; } diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index ff7e734e335b0..7d574f5132e2f 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -88,13 +88,15 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (ilwt->connected) { + /* cache only if we don't create a dst reference loop */ + if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); local_bh_enable(); } } + skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 0ac4283acdf20..7c05ac846646f 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -262,10 +262,18 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + rlwt = rpl_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); @@ -280,7 +288,9 @@ static int rpl_input(struct sk_buff *skb) if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 33833b2064c07..51583461ae29b 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -472,10 +472,18 @@ static int seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); @@ -490,7 +498,9 @@ static int seg6_input_core(struct net *net, struct sock *sk, if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index 06fb8e6944b06..7a0cae9a81114 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -24,7 +24,7 @@ #include #include #include - +#include /** * llc_sap_action_unitdata_ind - forward UI PDU to network layer @@ -40,6 +40,26 @@ int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb) return 0; } +static int llc_prepare_and_xmit(struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct sk_buff *nskb; + int rc; + + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (rc) + return rc; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + if (skb->sk) + skb_set_owner_w(nskb, skb->sk); + + return dev_queue_xmit(nskb); +} + /** * llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer * @sap: SAP @@ -52,17 +72,12 @@ int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb) int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); - rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) { - skb_get(skb); - rc = dev_queue_xmit(skb); - } - return rc; + + return llc_prepare_and_xmit(skb); } /** @@ -77,17 +92,12 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); - rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) { - skb_get(skb); - rc = dev_queue_xmit(skb); - } - return rc; + + return llc_prepare_and_xmit(skb); } /** @@ -133,17 +143,12 @@ int llc_sap_action_send_xid_r(struct llc_sap *sap, struct sk_buff *skb) int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_test_cmd(skb); - rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) { - skb_get(skb); - rc = dev_queue_xmit(skb); - } - return rc; + + return llc_prepare_and_xmit(skb); } int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 299d38e9e8630..35349a7f16cb4 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -116,8 +116,14 @@ void drv_remove_interface(struct ieee80211_local *local, sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; - /* Remove driver debugfs entries */ - ieee80211_debugfs_recreate_netdev(sdata, sdata->vif.valid_links); + /* + * Remove driver debugfs entries. + * The virtual monitor interface doesn't get a debugfs + * entry, so it's exempt here. + */ + if (sdata != rcu_access_pointer(local->monitor_sdata)) + ieee80211_debugfs_recreate_netdev(sdata, + sdata->vif.valid_links); trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 0ea7e77860b73..738de269e13f0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1206,16 +1206,17 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) return; } - RCU_INIT_POINTER(local->monitor_sdata, NULL); - mutex_unlock(&local->iflist_mtx); - - synchronize_net(); - + clear_bit(SDATA_STATE_RUNNING, &sdata->state); ieee80211_link_release_channel(&sdata->deflink); if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) drv_remove_interface(local, sdata); + RCU_INIT_POINTER(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + + synchronize_net(); + kfree(sdata); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f8d52b3b0d0e4..36a9be9a66c8e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4959,6 +4959,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, parse_params.start = bss_ies->data; parse_params.len = bss_ies->len; parse_params.bss = cbss; + parse_params.link_id = -1; bss_elems = ieee802_11_parse_elems_full(&parse_params); if (!bss_elems) { ret = false; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index cd318c1c67bec..6da39c864f45b 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -47,6 +47,9 @@ struct ieee80211_elems_parse { /* The EPCS Multi-Link element in the original elements */ const struct element *ml_epcs_elem; + bool multi_link_inner; + bool skip_vendor; + /* * scratch buffer that can be used for various element parsing related * tasks, e.g., element de-fragmentation etc. @@ -152,12 +155,11 @@ ieee80211_parse_extension_element(u32 *crc, switch (le16_get_bits(mle->control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: - if (elems_parse->ml_basic_elem) { + if (elems_parse->multi_link_inner) { elems->parse_error |= IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC; break; } - elems_parse->ml_basic_elem = elem; break; case IEEE80211_ML_CONTROL_TYPE_RECONF: elems_parse->ml_reconf_elem = elem; @@ -399,6 +401,9 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_VENDOR_SPECIFIC: + if (elems_parse->skip_vendor) + break; + if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && pos[2] == 0xf2) { /* Microsoft OUI (00:50:F2) */ @@ -866,21 +871,36 @@ ieee80211_mle_get_sta_prof(struct ieee80211_elems_parse *elems_parse, } } -static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, - struct ieee80211_elems_parse_params *params) +static const struct element * +ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, + struct ieee80211_elems_parse_params *params, + struct ieee80211_elems_parse_params *sub) { struct ieee802_11_elems *elems = &elems_parse->elems; struct ieee80211_mle_per_sta_profile *prof; - struct ieee80211_elems_parse_params sub = { - .mode = params->mode, - .action = params->action, - .from_ap = params->from_ap, - .link_id = -1, - }; - ssize_t ml_len = elems->ml_basic_len; - const struct element *non_inherit = NULL; + const struct element *tmp; + ssize_t ml_len; const u8 *end; + if (params->mode < IEEE80211_CONN_MODE_EHT) + return NULL; + + for_each_element_extid(tmp, WLAN_EID_EXT_EHT_MULTI_LINK, + elems->ie_start, elems->total_len) { + const struct ieee80211_multi_link_elem *mle = + (void *)tmp->data + 1; + + if (!ieee80211_mle_size_ok(tmp->data + 1, tmp->datalen - 1)) + continue; + + if (le16_get_bits(mle->control, IEEE80211_ML_CONTROL_TYPE) != + IEEE80211_ML_CONTROL_TYPE_BASIC) + continue; + + elems_parse->ml_basic_elem = tmp; + break; + } + ml_len = cfg80211_defragment_element(elems_parse->ml_basic_elem, elems->ie_start, elems->total_len, @@ -891,26 +911,26 @@ static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, WLAN_EID_FRAGMENT); if (ml_len < 0) - return; + return NULL; elems->ml_basic = (const void *)elems_parse->scratch_pos; elems->ml_basic_len = ml_len; elems_parse->scratch_pos += ml_len; if (params->link_id == -1) - return; + return NULL; ieee80211_mle_get_sta_prof(elems_parse, params->link_id); prof = elems->prof; if (!prof) - return; + return NULL; /* check if we have the 4 bytes for the fixed part in assoc response */ if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { elems->prof = NULL; elems->sta_prof_len = 0; - return; + return NULL; } /* @@ -919,13 +939,17 @@ static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, * the -1 is because the 'sta_info_len' is accounted to as part of the * per-STA profile, but not part of the 'u8 variable[]' portion. */ - sub.start = prof->variable + prof->sta_info_len - 1 + 4; + sub->start = prof->variable + prof->sta_info_len - 1 + 4; end = (const u8 *)prof + elems->sta_prof_len; - sub.len = end - sub.start; + sub->len = end - sub->start; - non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - sub.start, sub.len); - _ieee802_11_parse_elems_full(&sub, elems_parse, non_inherit); + sub->mode = params->mode; + sub->action = params->action; + sub->from_ap = params->from_ap; + sub->link_id = -1; + + return cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub->start, sub->len); } static void @@ -973,15 +997,19 @@ ieee80211_mle_defrag_epcs(struct ieee80211_elems_parse *elems_parse) struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) { + struct ieee80211_elems_parse_params sub = {}; struct ieee80211_elems_parse *elems_parse; - struct ieee802_11_elems *elems; const struct element *non_inherit = NULL; - u8 *nontransmitted_profile; - int nontransmitted_profile_len = 0; + struct ieee802_11_elems *elems; size_t scratch_len = 3 * params->len; + bool multi_link_inner = false; BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0); + /* cannot parse for both a specific link and non-transmitted BSS */ + if (WARN_ON(params->link_id >= 0 && params->bss)) + return NULL; + elems_parse = kzalloc(struct_size(elems_parse, scratch, scratch_len), GFP_ATOMIC); if (!elems_parse) @@ -998,34 +1026,51 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) ieee80211_clear_tpe(&elems->tpe); ieee80211_clear_tpe(&elems->csa_tpe); - nontransmitted_profile = elems_parse->scratch_pos; - nontransmitted_profile_len = - ieee802_11_find_bssid_profile(params->start, params->len, - elems, params->bss, - nontransmitted_profile); - elems_parse->scratch_pos += nontransmitted_profile_len; - non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - nontransmitted_profile, - nontransmitted_profile_len); + /* + * If we're looking for a non-transmitted BSS then we cannot at + * the same time be looking for a second link as the two can only + * appear in the same frame carrying info for different BSSes. + * + * In any case, we only look for one at a time, as encoded by + * the WARN_ON above. + */ + if (params->bss) { + int nontx_len = + ieee802_11_find_bssid_profile(params->start, + params->len, + elems, params->bss, + elems_parse->scratch_pos); + sub.start = elems_parse->scratch_pos; + sub.mode = params->mode; + sub.len = nontx_len; + sub.action = params->action; + sub.link_id = params->link_id; + + /* consume the space used for non-transmitted profile */ + elems_parse->scratch_pos += nontx_len; + + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub.start, nontx_len); + } else { + /* must always parse to get elems_parse->ml_basic_elem */ + non_inherit = ieee80211_prep_mle_link_parse(elems_parse, params, + &sub); + multi_link_inner = true; + } + elems_parse->skip_vendor = + cfg80211_find_elem(WLAN_EID_VENDOR_SPECIFIC, + sub.start, sub.len); elems->crc = _ieee802_11_parse_elems_full(params, elems_parse, non_inherit); - /* Override with nontransmitted profile, if found */ - if (nontransmitted_profile_len) { - struct ieee80211_elems_parse_params sub = { - .mode = params->mode, - .start = nontransmitted_profile, - .len = nontransmitted_profile_len, - .action = params->action, - .link_id = params->link_id, - }; - + /* Override with nontransmitted/per-STA profile if found */ + if (sub.len) { + elems_parse->multi_link_inner = multi_link_inner; + elems_parse->skip_vendor = false; _ieee802_11_parse_elems_full(&sub, elems_parse, NULL); } - ieee80211_mle_parse_link(elems_parse, params); - ieee80211_mle_defrag_reconf(elems_parse); ieee80211_mle_defrag_epcs(elems_parse); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f6b631faf4f7f..7f02bd5891eb9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -687,7 +687,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { - if (!local->ops->flush) + if (!local->ops->flush && !drop) return; /* @@ -714,7 +714,8 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, } } - drv_flush(local, sdata, queues, drop); + if (local->ops->flush) + drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 21b7c3b280b45..ea1efef3572ae 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -213,8 +213,8 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) goto out_wq; } - hrtimer_init(&local->ifs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - local->ifs_timer.function = ieee802154_xmit_ifs_timer; + hrtimer_setup(&local->ifs_timer, ieee802154_xmit_ifs_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); wpan_phy_set_dev(local->phy, local->hw.parent); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 572d160edca33..7868207c4e9d9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -977,7 +977,7 @@ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, - bool needs_id) + bool needs_id, bool replace) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; @@ -1017,6 +1017,17 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, if (entry->addr.id) goto out; + /* allow callers that only need to look up the local + * addr's id to skip replacement. This allows them to + * avoid calling synchronize_rcu in the packet recv + * path. + */ + if (!replace) { + kfree(entry); + ret = cur->addr.id; + goto out; + } + pernet->addrs--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); @@ -1165,7 +1176,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc entry->ifindex = 0; entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); if (ret < 0) kfree(entry); @@ -1433,7 +1444,8 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, - !mptcp_pm_has_addr_attr_id(attr, info)); + !mptcp_pm_has_addr_attr_id(attr, info), + true); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; @@ -1514,11 +1526,6 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, if (mptcp_pm_is_userspace(msk)) goto next; - if (list_empty(&msk->conn_list)) { - mptcp_pm_remove_anno_addr(msk, addr, false); - goto next; - } - lock_sock(sk); remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f6a207958459d..ad21925af0612 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1199,6 +1199,8 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) pr_debug("TCP fallback already done (msk=%p)\n", msk); return; } + if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) + return; set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fd021cf8286ef..9f18217dddc86 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1142,7 +1142,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); - subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1286,18 +1285,6 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss mptcp_schedule_work(sk); } -static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) -{ - struct mptcp_sock *msk = mptcp_sk(subflow->conn); - - if (subflow->mp_join) - return false; - else if (READ_ONCE(msk->csum_enabled)) - return !subflow->valid_csum_seen; - else - return READ_ONCE(msk->allow_infinite_fallback); -} - static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1393,7 +1380,7 @@ static bool subflow_check_data_avail(struct sock *ssk) return true; } - if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + if (!READ_ONCE(msk->allow_infinite_fallback)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ @@ -1772,10 +1759,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ - __netns_tracker_free(net, &sf->sk->ns_tracker, false); - sf->sk->sk_net_refcnt = 1; - get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 85311226183a2..a53ea60d0a78d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -795,16 +795,6 @@ static int netlink_release(struct socket *sock) sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); - /* Because struct net might disappear soon, do not keep a pointer. */ - if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - /* Because of deferred_put_nlk_sk and use of work queue, - * it is possible netns will be freed before this socket. - */ - sock_net_set(sk, &init_net); - __netns_tracker_alloc(&init_net, &sk->ns_tracker, - false, GFP_KERNEL); - } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 0581c53e65170..3cc2f303bf786 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } - /* Update ns_tracker to current stack trace and refcounted tracker */ - __netns_tracker_free(net, &sk->ns_tracker, false); - - sk->sk_net_refcnt = 1; - netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); + put_net(net); } rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5e740c4862034..a64a0cab1bf7f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -360,7 +360,6 @@ struct rxrpc_peer { u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ unsigned int ackr_max_data; /* Maximum data advertised by peer */ - seqcount_t mtu_lock; /* Lockless MTU access management */ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ unsigned int max_data; /* Maximum packet data capacity for this peer */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9047ba13bd31e..24aceb183c2c3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -810,9 +810,7 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb if (max_mtu < peer->max_data) { trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, rxrpc_pmtud_reduce_ack); - write_seqcount_begin(&peer->mtu_lock); peer->max_data = max_mtu; - write_seqcount_end(&peer->mtu_lock); } max_data = umin(max_mtu, peer->max_data); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index bc283da9ee402..7f4729234957e 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -130,9 +130,7 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) peer->pmtud_bad = max_data + 1; trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); - write_seqcount_begin(&peer->mtu_lock); peer->max_data = max_data; - write_seqcount_end(&peer->mtu_lock); } } @@ -408,13 +406,8 @@ void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t a } max_data = umin(max_data, peer->ackr_max_data); - if (max_data != peer->max_data) { - preempt_disable(); - write_seqcount_begin(&peer->mtu_lock); + if (max_data != peer->max_data) peer->max_data = max_data; - write_seqcount_end(&peer->mtu_lock); - preempt_enable(); - } jumbo = max_data + sizeof(struct rxrpc_jumbo_header); jumbo /= RXRPC_JUMBO_SUBPKTLEN; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 0fcc87f0409f9..56e09d161a97f 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -235,7 +235,6 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; @@ -325,10 +324,10 @@ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(local, peer, hash_key); - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); } /* diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 7ef93407be830..e848a4777b8c7 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -478,6 +478,18 @@ static int rxperf_deliver_request(struct rxperf_call *call) call->unmarshal++; fallthrough; case 2: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + /* Deal with the terminal magic cookie. */ + call->iov_len = 4; + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 3: ret = rxperf_extract_data(call, false); if (ret < 0) return ret; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 91c0ec729823b..c1f75f2727576 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -287,8 +287,7 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e3e91cf867eb9..6d85b342a8576 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -619,8 +619,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { - hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); - wd->timer.function = qdisc_watchdog; + hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a68e17891b0b5..14021b8123290 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1932,8 +1932,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { - hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); @@ -2056,8 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, spin_lock_init(&q->current_entry_lock); - hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ca6984541edbd..3e6cb35baf25a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3337,10 +3337,7 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) * which need net ref. */ sk = smc->clcsock->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); return 0; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index cb279eb9ac4ba..7ce5e28a6c031 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1674,12 +1674,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd) } } -#ifdef CONFIG_PROC_FS static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; struct sunrpc_net *sn; + if (!IS_ENABLED(CONFIG_PROC_FS)) + return 0; + sn = net_generic(net, sunrpc_net_id); cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->procfs == NULL) @@ -1707,12 +1709,6 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) remove_cache_proc_entries(cd); return -ENOMEM; } -#else /* CONFIG_PROC_FS */ -static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) -{ - return 0; -} -#endif void __init cache_initialize(void) { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cef623ea15060..9b45fbdc90cab 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -864,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task) if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index cb3bd12f5818b..72e5a01df3d35 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1541,10 +1541,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { - __netns_tracker_free(net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sock->sk); if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c60936d8cef71..83cc095846d35 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1941,12 +1941,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } - if (protocol == IPPROTO_TCP) { - __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(xprt->xprt_net, 1); - } + if (protocol == IPPROTO_TCP) + sk_net_refcnt_upgrade(sock->sk); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) @@ -2581,7 +2577,15 @@ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); - lower_transport->xprt_err = status ? -EACCES : 0; + switch (status) { + case 0: + case -EACCES: + case -ETIMEDOUT: + lower_transport->xprt_err = status; + break; + default: + lower_transport->xprt_err = -EACCES; + } complete(&lower_transport->handshake_done); xprt_put(lower_xprt); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 34945de1fb1fa..f0e613d976640 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2102,6 +2102,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out_sock_put; } + sock_put(other); goto lookup; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7d3da0f6833d..e87267fbb442e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4220,6 +4220,11 @@ static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) if (flags[flag]) *mntrflags |= (1<queue); - hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); - xtfs->iptfs_timer.function = iptfs_delay_timer; + hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); spin_lock_init(&xtfs->drop_lock); - hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); - xtfs->drop_timer.function = iptfs_drop_timer; + hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); /* Modify type (esp) adjustment values */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ad2202fa82f34..9bd14fdb67a57 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -746,8 +746,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); INIT_HLIST_NODE(&x->byseq); - hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); - x->mtimer.function = xfrm_timer_handler; + hrtimer_setup(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME, + HRTIMER_MODE_ABS_SOFT); timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 0640b7e115be1..4c1a10a419cf1 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -30,6 +30,7 @@ #include "signal.c" #include "slab.c" #include "spinlock.c" +#include "sync.c" #include "task.c" #include "uaccess.c" #include "vmalloc.c" diff --git a/rust/helpers/sync.c b/rust/helpers/sync.c new file mode 100644 index 0000000000000..ff7e68b488101 --- /dev/null +++ b/rust/helpers/sync.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_lockdep_register_key(struct lock_class_key *k) +{ + lockdep_register_key(k); +} + +void rust_helper_lockdep_unregister_key(struct lock_class_key *k) +{ + lockdep_unregister_key(k); +} diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 3498fb344dc93..4104bc26471a4 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -5,6 +5,8 @@ //! This module contains the kernel APIs related to synchronisation that have been ported or //! wrapped for usage by Rust code in the kernel. +use crate::pin_init; +use crate::prelude::*; use crate::types::Opaque; mod arc; @@ -23,26 +25,64 @@ pub use locked_by::LockedBy; /// Represents a lockdep class. It's a wrapper around C's `lock_class_key`. #[repr(transparent)] -pub struct LockClassKey(Opaque); +#[pin_data(PinnedDrop)] +pub struct LockClassKey { + #[pin] + inner: Opaque, +} // SAFETY: `bindings::lock_class_key` is designed to be used concurrently from multiple threads and // provides its own synchronization. unsafe impl Sync for LockClassKey {} impl LockClassKey { - /// Creates a new lock class key. - pub const fn new() -> Self { - Self(Opaque::uninit()) + /// Initializes a dynamically allocated lock class key. In the common case of using a + /// statically allocated lock class key, the static_lock_class! macro should be used instead. + /// + /// # Example + /// ``` + /// # use kernel::{c_str, stack_pin_init}; + /// # use kernel::alloc::KBox; + /// # use kernel::types::ForeignOwnable; + /// # use kernel::sync::{LockClassKey, SpinLock}; + /// + /// let key = KBox::pin_init(LockClassKey::new_dynamic(), GFP_KERNEL)?; + /// let key_ptr = key.into_foreign(); + /// + /// { + /// stack_pin_init!(let num: SpinLock = SpinLock::new( + /// 0, + /// c_str!("my_spinlock"), + /// // SAFETY: `key_ptr` is returned by the above `into_foreign()`, whose + /// // `from_foreign()` has not yet been called. + /// unsafe { > as ForeignOwnable>::borrow(key_ptr) } + /// )); + /// } + /// + /// // SAFETY: We dropped `num`, the only use of the key, so the result of the previous + /// // `borrow` has also been dropped. Thus, it's safe to use from_foreign. + /// unsafe { drop(> as ForeignOwnable>::from_foreign(key_ptr)) }; + /// + /// # Ok::<(), Error>(()) + /// ``` + pub fn new_dynamic() -> impl PinInit { + pin_init!(Self { + // SAFETY: lockdep_register_key expects an uninitialized block of memory + inner <- Opaque::ffi_init(|slot| unsafe { bindings::lockdep_register_key(slot) }) + }) } pub(crate) fn as_ptr(&self) -> *mut bindings::lock_class_key { - self.0.get() + self.inner.get() } } -impl Default for LockClassKey { - fn default() -> Self { - Self::new() +#[pinned_drop] +impl PinnedDrop for LockClassKey { + fn drop(self: Pin<&mut Self>) { + // SAFETY: self.as_ptr was registered with lockdep and self is pinned, so the address + // hasn't changed. Thus, it's safe to pass to unregister. + unsafe { bindings::lockdep_unregister_key(self.as_ptr()) } } } @@ -51,8 +91,11 @@ impl Default for LockClassKey { #[macro_export] macro_rules! static_lock_class { () => {{ - static CLASS: $crate::sync::LockClassKey = $crate::sync::LockClassKey::new(); - &CLASS + static CLASS: $crate::sync::LockClassKey = + // SAFETY: lockdep expects uninitialized memory when it's handed a statically allocated + // lock_class_key + unsafe { ::core::mem::MaybeUninit::uninit().assume_init() }; + $crate::prelude::Pin::static_ref(&CLASS) }}; } diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 7df565038d7d0..fbf68ada582f6 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -11,12 +11,13 @@ use crate::{ init::PinInit, pin_init, str::CStr, - task::{MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE}, + task::{ + MAX_SCHEDULE_TIMEOUT, TASK_FREEZABLE, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE, + }, time::Jiffies, types::Opaque, }; -use core::marker::PhantomPinned; -use core::ptr; +use core::{marker::PhantomPinned, pin::Pin, ptr}; use macros::pin_data; /// Creates a [`CondVar`] initialiser with the given name and a newly-created lock class. @@ -101,7 +102,7 @@ unsafe impl Sync for CondVar {} impl CondVar { /// Constructs a new condvar initialiser. - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit { + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit { pin_init!(Self { _pin: PhantomPinned, // SAFETY: `slot` is valid while the closure is called and both `name` and `key` have @@ -159,6 +160,25 @@ impl CondVar { crate::current!().signal_pending() } + /// Releases the lock and waits for a notification in interruptible and freezable mode. + /// + /// The process is allowed to be frozen during this sleep. No lock should be held when calling + /// this function, and there is a lockdep assertion for this. Freezing a task that holds a lock + /// can trivially deadlock vs another task that needs that lock to complete before it too can + /// hit freezable. + #[must_use = "wait_interruptible_freezable returns if a signal is pending, so the caller must check the return value"] + pub fn wait_interruptible_freezable( + &self, + guard: &mut Guard<'_, T, B>, + ) -> bool { + self.wait_internal( + TASK_INTERRUPTIBLE | TASK_FREEZABLE, + guard, + MAX_SCHEDULE_TIMEOUT, + ); + crate::current!().signal_pending() + } + /// Releases the lock and waits for a notification in interruptible mode. /// /// Atomically releases the given lock (whose ownership is proven by the guard) and puts the diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index eb80048e0110d..360a10a9216d6 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -12,7 +12,7 @@ use crate::{ str::CStr, types::{NotThreadSafe, Opaque, ScopeGuard}, }; -use core::{cell::UnsafeCell, marker::PhantomPinned}; +use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; use macros::pin_data; pub mod mutex; @@ -129,7 +129,7 @@ unsafe impl Sync for Lock {} impl Lock { /// Constructs a new lock initialiser. - pub fn new(t: T, name: &'static CStr, key: &'static LockClassKey) -> impl PinInit { + pub fn new(t: T, name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit { pin_init!(Self { data: UnsafeCell::new(t), _pin: PhantomPinned, @@ -199,7 +199,36 @@ pub struct Guard<'a, T: ?Sized, B: Backend> { // SAFETY: `Guard` is sync when the data protected by the lock is also sync. unsafe impl Sync for Guard<'_, T, B> {} -impl Guard<'_, T, B> { +impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { + /// Returns the lock that this guard originates from. + /// + /// # Examples + /// + /// The following example shows how to use [`Guard::lock_ref()`] to assert the corresponding + /// lock is held. + /// + /// ``` + /// # use kernel::{new_spinlock, stack_pin_init, sync::lock::{Backend, Guard, Lock}}; + /// + /// fn assert_held(guard: &Guard<'_, T, B>, lock: &Lock) { + /// // Address-equal means the same lock. + /// assert!(core::ptr::eq(guard.lock_ref(), lock)); + /// } + /// + /// // Creates a new lock on the stack. + /// stack_pin_init!{ + /// let l = new_spinlock!(42) + /// } + /// + /// let g = l.lock(); + /// + /// // `g` originates from `l`. + /// assert_held(&g, &l); + /// ``` + pub fn lock_ref(&self) -> &'a Lock { + self.lock + } + pub(crate) fn do_unlocked(&mut self, cb: impl FnOnce() -> U) -> U { // SAFETY: The caller owns the lock, so it is safe to unlock it. unsafe { B::unlock(self.lock.state.get(), &self.state) }; diff --git a/rust/kernel/sync/lock/global.rs b/rust/kernel/sync/lock/global.rs index 480ee724e3cc4..d65f94b5caf26 100644 --- a/rust/kernel/sync/lock/global.rs +++ b/rust/kernel/sync/lock/global.rs @@ -13,6 +13,7 @@ use crate::{ use core::{ cell::UnsafeCell, marker::{PhantomData, PhantomPinned}, + pin::Pin, }; /// Trait implemented for marker types for global locks. @@ -26,7 +27,7 @@ pub trait GlobalLockBackend { /// The backend used for this global lock. type Backend: Backend + 'static; /// The class for this global lock. - fn get_lock_class() -> &'static LockClassKey; + fn get_lock_class() -> Pin<&'static LockClassKey>; } /// Type used for global locks. @@ -270,7 +271,7 @@ macro_rules! global_lock { type Item = $valuety; type Backend = $crate::global_lock_inner!(backend $kind); - fn get_lock_class() -> &'static $crate::sync::LockClassKey { + fn get_lock_class() -> Pin<&'static $crate::sync::LockClassKey> { $crate::static_lock_class!() } } diff --git a/rust/kernel/sync/poll.rs b/rust/kernel/sync/poll.rs index d5f17153b4244..c4934f82d68bd 100644 --- a/rust/kernel/sync/poll.rs +++ b/rust/kernel/sync/poll.rs @@ -89,7 +89,7 @@ pub struct PollCondVar { impl PollCondVar { /// Constructs a new condvar initialiser. - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit { + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit { pin_init!(Self { inner <- CondVar::new(name, key), }) diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 07bc22a7645c0..ea43a3b8d9c57 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -23,6 +23,8 @@ pub const MAX_SCHEDULE_TIMEOUT: c_long = c_long::MAX; pub const TASK_INTERRUPTIBLE: c_int = bindings::TASK_INTERRUPTIBLE as c_int; /// Bitmask for tasks that are sleeping in an uninterruptible state. pub const TASK_UNINTERRUPTIBLE: c_int = bindings::TASK_UNINTERRUPTIBLE as c_int; +/// Bitmask for tasks that are sleeping in a freezable state. +pub const TASK_FREEZABLE: c_int = bindings::TASK_FREEZABLE as c_int; /// Convenience constant for waking up tasks regardless of whether they are in interruptible or /// uninterruptible sleep. pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 0cd100d2aefb4..6b6f3ad089513 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -369,7 +369,7 @@ unsafe impl Sync for Work {} impl Work { /// Creates a new instance of [`Work`]. #[inline] - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit where T: WorkItem, { diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh deleted file mode 100755 index 9459ca4f0f11f..0000000000000 --- a/scripts/gcc-x86_32-has-stack-protector.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# This requires GCC 8.1 or better. Specifically, we require -# -mstack-protector-guard-reg, added by -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708 - -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh deleted file mode 100755 index f680bb01aeeb3..0000000000000 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 13eb8b3901b8f..8f7c4fb78c2cc 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -164,7 +164,7 @@ def get_current_task(cpu): var_ptr = gdb.parse_and_eval("(struct task_struct *)cpu_tasks[0].task") return var_ptr.dereference() else: - var_ptr = gdb.parse_and_eval("&pcpu_hot.current_task") + var_ptr = gdb.parse_and_eval("¤t_task") return per_cpu(var_ptr, cpu).dereference() elif utils.is_target_arch("aarch64"): current_task_addr = gdb.parse_and_eval("(unsigned long)$SP_EL0") diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 03852da3d2490..4b0234e4b12f3 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -5,7 +5,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * - * Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S + * Usage: kallsyms [--all-symbols] in.map > out.S * * Table compression uses all the unused char codes on the symbols and * maps these to the most used substrings (tokens). For instance, it might @@ -37,7 +37,6 @@ struct sym_entry { unsigned long long addr; unsigned int len; unsigned int seq; - bool percpu_absolute; unsigned char sym[]; }; @@ -55,14 +54,9 @@ static struct addr_range text_ranges[] = { #define text_range_text (&text_ranges[0]) #define text_range_inittext (&text_ranges[1]) -static struct addr_range percpu_range = { - "__per_cpu_start", "__per_cpu_end", -1ULL, 0 -}; - static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; -static int absolute_percpu; static int token_profit[0x10000]; @@ -73,7 +67,7 @@ static unsigned char best_table_len[256]; static void usage(void) { - fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S\n"); + fprintf(stderr, "Usage: kallsyms [--all-symbols] in.map > out.S\n"); exit(1); } @@ -164,7 +158,6 @@ static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) return NULL; check_symbol_range(name, addr, text_ranges, ARRAY_SIZE(text_ranges)); - check_symbol_range(name, addr, &percpu_range, 1); /* include the type field in the symbol name, so that it gets * compressed together */ @@ -175,7 +168,6 @@ static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) sym->len = len; sym->sym[0] = type; strcpy(sym_name(sym), name); - sym->percpu_absolute = false; return sym; } @@ -319,11 +311,6 @@ static int expand_symbol(const unsigned char *data, int len, char *result) return total; } -static bool symbol_absolute(const struct sym_entry *s) -{ - return s->percpu_absolute; -} - static int compare_names(const void *a, const void *b) { int ret; @@ -455,22 +442,11 @@ static void write_src(void) */ long long offset; - bool overflow; - - if (!absolute_percpu) { - offset = table[i]->addr - relative_base; - overflow = offset < 0 || offset > UINT_MAX; - } else if (symbol_absolute(table[i])) { - offset = table[i]->addr; - overflow = offset < 0 || offset > INT_MAX; - } else { - offset = relative_base - table[i]->addr - 1; - overflow = offset < INT_MIN || offset >= 0; - } - if (overflow) { + + offset = table[i]->addr - relative_base; + if (offset < 0 || offset > UINT_MAX) { fprintf(stderr, "kallsyms failure: " - "%s symbol value %#llx out of range in relative mode\n", - symbol_absolute(table[i]) ? "absolute" : "relative", + "relative symbol value %#llx out of range\n", table[i]->addr); exit(EXIT_FAILURE); } @@ -725,36 +701,15 @@ static void sort_symbols(void) qsort(table, table_cnt, sizeof(table[0]), compare_symbols); } -static void make_percpus_absolute(void) -{ - unsigned int i; - - for (i = 0; i < table_cnt; i++) - if (symbol_in_range(table[i], &percpu_range, 1)) { - /* - * Keep the 'A' override for percpu symbols to - * ensure consistent behavior compared to older - * versions of this tool. - */ - table[i]->sym[0] = 'A'; - table[i]->percpu_absolute = true; - } -} - /* find the minimum non-absolute symbol address */ static void record_relative_base(void) { - unsigned int i; - - for (i = 0; i < table_cnt; i++) - if (!symbol_absolute(table[i])) { - /* - * The table is sorted by address. - * Take the first non-absolute symbol value. - */ - relative_base = table[i]->addr; - return; - } + /* + * The table is sorted by address. + * Take the first symbol value. + */ + if (table_cnt) + relative_base = table[0]->addr; } int main(int argc, char **argv) @@ -762,7 +717,6 @@ int main(int argc, char **argv) while (1) { static const struct option long_options[] = { {"all-symbols", no_argument, &all_symbols, 1}, - {"absolute-percpu", no_argument, &absolute_percpu, 1}, {}, }; @@ -779,8 +733,6 @@ int main(int argc, char **argv) read_map(argv[optind]); shrink_table(); - if (absolute_percpu) - make_percpus_absolute(); sort_symbols(); record_relative_base(); optimize_token_table(); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 56a077d204cfa..67e66333bd2a3 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -144,10 +144,6 @@ kallsyms() kallsymopt="${kallsymopt} --all-symbols" fi - if is_enabled CONFIG_KALLSYMS_ABSOLUTE_PERCPU; then - kallsymopt="${kallsymopt} --absolute-percpu" - fi - info KSYMS "${2}.S" scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S" diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 91c91201212c4..787868183b843 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -19,12 +19,14 @@ binutils) gcc) if [ "$ARCH" = parisc64 ]; then echo 12.0.0 + elif [ "$SRCARCH" = x86 ]; then + echo 8.1.0 else echo 5.1.0 fi ;; llvm) - if [ "$SRCARCH" = s390 ]; then + if [ "$SRCARCH" = s390 -o "$SRCARCH" = x86 ]; then echo 15.0.0 elif [ "$SRCARCH" = loongarch ]; then echo 18.0.0 diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index 2966473b46609..b96538787f3d9 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -63,7 +63,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then # Clear VPATH and srcroot because the source files reside in the output # directory. # shellcheck disable=SC2016 # $(MAKE) and $(build) will be expanded by Make - "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC='"${CC}"' VPATH= srcroot=. $(build)='"${destdir}"/scripts + "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC='"${CC}"' VPATH= srcroot=. $(build)='"$(realpath --relative-base=. "${destdir}")"/scripts rm -f "${destdir}/scripts/Kbuild" fi diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 7c06ffd633d24..a5e730ffda57f 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -180,7 +180,7 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode, } /* - * Dump large security xattr values as a continuous ascii hexademical string. + * Dump large security xattr values as a continuous ascii hexadecimal string. * (pr_debug is limited to 64 bytes.) */ static void dump_security_xattr_l(const char *prefix, const void *src, diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 377e57e9084f0..0add782e73ba2 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -169,7 +169,7 @@ static int is_unsupported_hmac_fs(struct dentry *dentry) * and compare it against the stored security.evm xattr. * * For performance: - * - use the previoulsy retrieved xattr value and length to calculate the + * - use the previously retrieved xattr value and length to calculate the * HMAC.) * - cache the verification result in the iint, when available. * diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 24d09ea91b877..a4f284bd846c1 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -149,6 +149,9 @@ struct ima_kexec_hdr { #define IMA_CHECK_BLACKLIST 0x40000000 #define IMA_VERITY_REQUIRED 0x80000000 +/* Exclude non-action flags which are not rule-specific. */ +#define IMA_NONACTION_RULE_FLAGS (IMA_NONACTION_FLAGS & ~IMA_NEW_FILE) + #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) #define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 9f9897a7c217e..28b8b0db6f9bb 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -269,10 +269,13 @@ static int process_measurement(struct file *file, const struct cred *cred, mutex_lock(&iint->mutex); if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags)) - /* reset appraisal flags if ima_inode_post_setattr was called */ + /* + * Reset appraisal flags (action and non-action rule-specific) + * if ima_inode_post_setattr was called. + */ iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED | IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK | - IMA_NONACTION_FLAGS); + IMA_NONACTION_RULE_FLAGS); /* * Re-evaulate the file if either the xattr has changed or the @@ -1011,9 +1014,9 @@ int process_buffer_measurement(struct mnt_idmap *idmap, } /* - * Both LSM hooks and auxilary based buffer measurements are - * based on policy. To avoid code duplication, differentiate - * between the LSM hooks and auxilary buffer measurements, + * Both LSM hooks and auxiliary based buffer measurements are + * based on policy. To avoid code duplication, differentiate + * between the LSM hooks and auxiliary buffer measurements, * retrieving the policy rule information only for the LSM hook * buffer measurements. */ diff --git a/security/landlock/net.c b/security/landlock/net.c index d5dcc4407a197..104b6c01fe503 100644 --- a/security/landlock/net.c +++ b/security/landlock/net.c @@ -63,8 +63,7 @@ static int current_check_access_socket(struct socket *const sock, if (WARN_ON_ONCE(dom->num_layers < 1)) return -EACCES; - /* Checks if it's a (potential) TCP socket. */ - if (sock->type != SOCK_STREAM) + if (!sk_is_tcp(sock->sk)) return 0; /* Checks for minimal header length to safely read sa_family. */ diff --git a/security/landlock/ruleset.c b/security/landlock/ruleset.c index 241ce44375b6a..bff4e40a3093c 100644 --- a/security/landlock/ruleset.c +++ b/security/landlock/ruleset.c @@ -124,7 +124,7 @@ create_rule(const struct landlock_id id, return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { - /* This should be catched by insert_rule(). */ + /* This should have been caught by insert_rule(). */ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index cb66ec42a3f8a..706f53e39b53c 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -106,7 +106,7 @@ static struct snd_seq_client *clientptr(int clientid) return clienttab[clientid]; } -struct snd_seq_client *snd_seq_client_use_ptr(int clientid) +static struct snd_seq_client *client_use_ptr(int clientid, bool load_module) { unsigned long flags; struct snd_seq_client *client; @@ -126,7 +126,7 @@ struct snd_seq_client *snd_seq_client_use_ptr(int clientid) } spin_unlock_irqrestore(&clients_lock, flags); #ifdef CONFIG_MODULES - if (!in_interrupt()) { + if (load_module) { static DECLARE_BITMAP(client_requested, SNDRV_SEQ_GLOBAL_CLIENTS); static DECLARE_BITMAP(card_requested, SNDRV_CARDS); @@ -168,6 +168,20 @@ struct snd_seq_client *snd_seq_client_use_ptr(int clientid) return client; } +/* get snd_seq_client object for the given id quickly */ +struct snd_seq_client *snd_seq_client_use_ptr(int clientid) +{ + return client_use_ptr(clientid, false); +} + +/* get snd_seq_client object for the given id; + * if not found, retry after loading the modules + */ +static struct snd_seq_client *client_load_and_use_ptr(int clientid) +{ + return client_use_ptr(clientid, IS_ENABLED(CONFIG_MODULES)); +} + /* Take refcount and perform ioctl_mutex lock on the given client; * used only for OSS sequencer * Unlock via snd_seq_client_ioctl_unlock() below @@ -176,7 +190,7 @@ bool snd_seq_client_ioctl_lock(int clientid) { struct snd_seq_client *client; - client = snd_seq_client_use_ptr(clientid); + client = client_load_and_use_ptr(clientid); if (!client) return false; mutex_lock(&client->ioctl_mutex); @@ -1195,7 +1209,7 @@ static int snd_seq_ioctl_running_mode(struct snd_seq_client *client, void *arg) int err = 0; /* requested client number */ - cptr = snd_seq_client_use_ptr(info->client); + cptr = client_load_and_use_ptr(info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ @@ -1257,7 +1271,7 @@ static int snd_seq_ioctl_get_client_info(struct snd_seq_client *client, struct snd_seq_client *cptr; /* requested client number */ - cptr = snd_seq_client_use_ptr(client_info->client); + cptr = client_load_and_use_ptr(client_info->client); if (cptr == NULL) return -ENOENT; /* don't change !!! */ @@ -1396,7 +1410,7 @@ static int snd_seq_ioctl_get_port_info(struct snd_seq_client *client, void *arg) struct snd_seq_client *cptr; struct snd_seq_client_port *port; - cptr = snd_seq_client_use_ptr(info->addr.client); + cptr = client_load_and_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; @@ -1503,10 +1517,10 @@ static int snd_seq_ioctl_subscribe_port(struct snd_seq_client *client, struct snd_seq_client *receiver = NULL, *sender = NULL; struct snd_seq_client_port *sport = NULL, *dport = NULL; - receiver = snd_seq_client_use_ptr(subs->dest.client); + receiver = client_load_and_use_ptr(subs->dest.client); if (!receiver) goto __end; - sender = snd_seq_client_use_ptr(subs->sender.client); + sender = client_load_and_use_ptr(subs->sender.client); if (!sender) goto __end; sport = snd_seq_port_use_ptr(sender, subs->sender.port); @@ -1871,7 +1885,7 @@ static int snd_seq_ioctl_get_client_pool(struct snd_seq_client *client, struct snd_seq_client_pool *info = arg; struct snd_seq_client *cptr; - cptr = snd_seq_client_use_ptr(info->client); + cptr = client_load_and_use_ptr(info->client); if (cptr == NULL) return -ENOENT; memset(info, 0, sizeof(*info)); @@ -1975,7 +1989,7 @@ static int snd_seq_ioctl_get_subscription(struct snd_seq_client *client, struct snd_seq_client_port *sport = NULL; result = -EINVAL; - sender = snd_seq_client_use_ptr(subs->sender.client); + sender = client_load_and_use_ptr(subs->sender.client); if (!sender) goto __end; sport = snd_seq_port_use_ptr(sender, subs->sender.port); @@ -2006,7 +2020,7 @@ static int snd_seq_ioctl_query_subs(struct snd_seq_client *client, void *arg) struct list_head *p; int i; - cptr = snd_seq_client_use_ptr(subs->root.client); + cptr = client_load_and_use_ptr(subs->root.client); if (!cptr) goto __end; port = snd_seq_port_use_ptr(cptr, subs->root.port); @@ -2073,7 +2087,7 @@ static int snd_seq_ioctl_query_next_client(struct snd_seq_client *client, if (info->client < 0) info->client = 0; for (; info->client < SNDRV_SEQ_MAX_CLIENTS; info->client++) { - cptr = snd_seq_client_use_ptr(info->client); + cptr = client_load_and_use_ptr(info->client); if (cptr) break; /* found */ } @@ -2096,7 +2110,7 @@ static int snd_seq_ioctl_query_next_port(struct snd_seq_client *client, struct snd_seq_client *cptr; struct snd_seq_client_port *port = NULL; - cptr = snd_seq_client_use_ptr(info->addr.client); + cptr = client_load_and_use_ptr(info->addr.client); if (cptr == NULL) return -ENXIO; @@ -2193,7 +2207,7 @@ static int snd_seq_ioctl_client_ump_info(struct snd_seq_client *caller, size = sizeof(struct snd_ump_endpoint_info); else size = sizeof(struct snd_ump_block_info); - cptr = snd_seq_client_use_ptr(client); + cptr = client_load_and_use_ptr(client); if (!cptr) return -ENOENT; @@ -2475,7 +2489,7 @@ int snd_seq_kernel_client_enqueue(int client, struct snd_seq_event *ev, if (check_event_type_and_length(ev)) return -EINVAL; - cptr = snd_seq_client_use_ptr(client); + cptr = client_load_and_use_ptr(client); if (cptr == NULL) return -EINVAL; @@ -2707,7 +2721,7 @@ void snd_seq_info_clients_read(struct snd_info_entry *entry, /* list the client table */ for (c = 0; c < SNDRV_SEQ_MAX_CLIENTS; c++) { - client = snd_seq_client_use_ptr(c); + client = client_load_and_use_ptr(c); if (client == NULL) continue; if (client->type == NO_CLIENT) { diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig index e393578cbe684..84ebf19f28836 100644 --- a/sound/pci/hda/Kconfig +++ b/sound/pci/hda/Kconfig @@ -222,6 +222,7 @@ comment "Set to Y if you want auto-loading the side codec driver" config SND_HDA_CODEC_REALTEK tristate "Build Realtek HD-audio codec support" + depends on INPUT select SND_HDA_GENERIC select SND_HDA_GENERIC_LEDS select SND_HDA_SCODEC_COMPONENT diff --git a/sound/pci/hda/cs35l56_hda_spi.c b/sound/pci/hda/cs35l56_hda_spi.c index d4ee5bb7c4866..9035784669053 100644 --- a/sound/pci/hda/cs35l56_hda_spi.c +++ b/sound/pci/hda/cs35l56_hda_spi.c @@ -22,6 +22,9 @@ static int cs35l56_hda_spi_probe(struct spi_device *spi) return -ENOMEM; cs35l56->base.dev = &spi->dev; + ret = cs35l56_init_config_for_spi(&cs35l56->base, spi); + if (ret) + return ret; #ifdef CS35L56_WAKE_HOLD_TIME_US cs35l56->base.can_hibernate = true; diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 67540e0373099..e67c22c59f02b 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2232,6 +2232,8 @@ static const struct snd_pci_quirk power_save_denylist[] = { SND_PCI_QUIRK(0x1631, 0xe017, "Packard Bell NEC IMEDIA 5204", 0), /* KONTRON SinglePC may cause a stall at runtime resume */ SND_PCI_QUIRK(0x1734, 0x1232, "KONTRON SinglePC", 0), + /* Dell ALC3271 */ + SND_PCI_QUIRK(0x1028, 0x0962, "Dell ALC3271", 0), {} }; diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 224616fbec4fa..d2a1f836dbbf7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -3843,6 +3843,79 @@ static void alc225_shutup(struct hda_codec *codec) } } +static void alc222_init(struct hda_codec *codec) +{ + struct alc_spec *spec = codec->spec; + hda_nid_t hp_pin = alc_get_hp_pin(spec); + bool hp1_pin_sense, hp2_pin_sense; + + if (!hp_pin) + return; + + msleep(30); + + hp1_pin_sense = snd_hda_jack_detect(codec, hp_pin); + hp2_pin_sense = snd_hda_jack_detect(codec, 0x14); + + if (hp1_pin_sense || hp2_pin_sense) { + msleep(2); + + if (hp1_pin_sense) + snd_hda_codec_write(codec, hp_pin, 0, + AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_OUT); + if (hp2_pin_sense) + snd_hda_codec_write(codec, 0x14, 0, + AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_OUT); + msleep(75); + + if (hp1_pin_sense) + snd_hda_codec_write(codec, hp_pin, 0, + AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE); + if (hp2_pin_sense) + snd_hda_codec_write(codec, 0x14, 0, + AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE); + + msleep(75); + } +} + +static void alc222_shutup(struct hda_codec *codec) +{ + struct alc_spec *spec = codec->spec; + hda_nid_t hp_pin = alc_get_hp_pin(spec); + bool hp1_pin_sense, hp2_pin_sense; + + if (!hp_pin) + hp_pin = 0x21; + + hp1_pin_sense = snd_hda_jack_detect(codec, hp_pin); + hp2_pin_sense = snd_hda_jack_detect(codec, 0x14); + + if (hp1_pin_sense || hp2_pin_sense) { + msleep(2); + + if (hp1_pin_sense) + snd_hda_codec_write(codec, hp_pin, 0, + AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); + if (hp2_pin_sense) + snd_hda_codec_write(codec, 0x14, 0, + AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); + + msleep(75); + + if (hp1_pin_sense) + snd_hda_codec_write(codec, hp_pin, 0, + AC_VERB_SET_PIN_WIDGET_CONTROL, 0x0); + if (hp2_pin_sense) + snd_hda_codec_write(codec, 0x14, 0, + AC_VERB_SET_PIN_WIDGET_CONTROL, 0x0); + + msleep(75); + } + alc_auto_setup_eapd(codec, false); + alc_shutup_pins(codec); +} + static void alc_default_init(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; @@ -4927,7 +5000,6 @@ static void alc298_fixup_samsung_amp_v2_4_amps(struct hda_codec *codec, alc298_samsung_v2_init_amps(codec, 4); } -#if IS_REACHABLE(CONFIG_INPUT) static void gpio2_mic_hotkey_event(struct hda_codec *codec, struct hda_jack_callback *event) { @@ -5036,10 +5108,6 @@ static void alc233_fixup_lenovo_line2_mic_hotkey(struct hda_codec *codec, spec->kb_dev = NULL; } } -#else /* INPUT */ -#define alc280_fixup_hp_gpio2_mic_hotkey NULL -#define alc233_fixup_lenovo_line2_mic_hotkey NULL -#endif /* INPUT */ static void alc269_fixup_hp_line1_mic1_led(struct hda_codec *codec, const struct hda_fixup *fix, int action) @@ -5053,6 +5121,16 @@ static void alc269_fixup_hp_line1_mic1_led(struct hda_codec *codec, } } +static void alc233_fixup_lenovo_low_en_micmute_led(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + struct alc_spec *spec = codec->spec; + + if (action == HDA_FIXUP_ACT_PRE_PROBE) + spec->micmute_led_polarity = 1; + alc233_fixup_lenovo_line2_mic_hotkey(codec, fix, action); +} + static void alc_hp_mute_disable(struct hda_codec *codec, unsigned int delay) { if (delay <= 0) @@ -7621,6 +7699,7 @@ enum { ALC275_FIXUP_DELL_XPS, ALC293_FIXUP_LENOVO_SPK_NOISE, ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY, + ALC233_FIXUP_LENOVO_L2MH_LOW_ENLED, ALC255_FIXUP_DELL_SPK_NOISE, ALC225_FIXUP_DISABLE_MIC_VREF, ALC225_FIXUP_DELL1_MIC_NO_PRESENCE, @@ -7690,7 +7769,6 @@ enum { ALC285_FIXUP_THINKPAD_X1_GEN7, ALC285_FIXUP_THINKPAD_HEADSET_JACK, ALC294_FIXUP_ASUS_ALLY, - ALC294_FIXUP_ASUS_ALLY_X, ALC294_FIXUP_ASUS_ALLY_PINS, ALC294_FIXUP_ASUS_ALLY_VERBS, ALC294_FIXUP_ASUS_ALLY_SPEAKER, @@ -8616,6 +8694,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc233_fixup_lenovo_line2_mic_hotkey, }, + [ALC233_FIXUP_LENOVO_L2MH_LOW_ENLED] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc233_fixup_lenovo_low_en_micmute_led, + }, [ALC233_FIXUP_INTEL_NUC8_DMIC] = { .type = HDA_FIXUP_FUNC, .v.func = alc_fixup_inv_dmic, @@ -9138,12 +9220,6 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC294_FIXUP_ASUS_ALLY_PINS }, - [ALC294_FIXUP_ASUS_ALLY_X] = { - .type = HDA_FIXUP_FUNC, - .v.func = tas2781_fixup_i2c, - .chained = true, - .chain_id = ALC294_FIXUP_ASUS_ALLY_PINS - }, [ALC294_FIXUP_ASUS_ALLY_PINS] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -10600,7 +10676,9 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8e1a, "HP ZBook Firefly 14 G12A", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), + SND_PCI_QUIRK(0x1043, 0x1054, "ASUS G614FH/FM/FP", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x1043, 0x1074, "ASUS G614PH/PM/PP", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x10a1, "ASUS UX391UA", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x10a4, "ASUS TP3407SA", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x10c0, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), @@ -10608,21 +10686,25 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x10d3, "ASUS K6500ZC", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x1154, "ASUS TP3607SH", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x115d, "Asus 1015E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x1043, 0x1194, "ASUS UM3406KA", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x11c0, "ASUS X556UR", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1204, "ASUS Strix G615JHR_JMR_JPR", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x1214, "ASUS Strix G615LH_LM_LP", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x125e, "ASUS Q524UQK", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1271, "ASUS X430UN", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1290, "ASUS X441SA", ALC233_FIXUP_EAPD_COEF_AND_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1294, "ASUS B3405CVA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x12a0, "ASUS X441UV", ALC233_FIXUP_EAPD_COEF_AND_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x12a3, "Asus N7691ZM", ALC269_FIXUP_ASUS_N7601ZM), SND_PCI_QUIRK(0x1043, 0x12af, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x12b4, "ASUS B3405CCA / P3405CCA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x12e0, "ASUS X541SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x12f0, "ASUS X541UV", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x1313, "Asus K42JZ", ALC269VB_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x13b0, "ASUS Z550SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1433, "ASUS GX650PY/PZ/PV/PU/PYV/PZV/PIV/PVV", ALC285_FIXUP_ASUS_I2C_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1460, "Asus VivoBook 15", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1463, "Asus GA402X/GA402N", ALC285_FIXUP_ASUS_I2C_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1473, "ASUS GU604VI/VC/VE/VG/VJ/VQ/VU/VV/VY/VZ", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1483, "ASUS GU603VQ/VU/VV/VJ/VI", ALC285_FIXUP_ASUS_HEADSET_MIC), @@ -10644,7 +10726,6 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1740, "ASUS UX430UA", ALC295_FIXUP_ASUS_DACS), SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK), SND_PCI_QUIRK(0x1043, 0x17f3, "ROG Ally NR2301L/X", ALC294_FIXUP_ASUS_ALLY), - SND_PCI_QUIRK(0x1043, 0x1eb3, "ROG Ally X RC72LA", ALC294_FIXUP_ASUS_ALLY_X), SND_PCI_QUIRK(0x1043, 0x1863, "ASUS UX6404VI/VV", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS), SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), @@ -10656,7 +10737,6 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x19ce, "ASUS B9450FA", ALC294_FIXUP_ASUS_HPE), SND_PCI_QUIRK(0x1043, 0x19e1, "ASUS UX581LV", ALC295_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW), - SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x1a63, "ASUS UX3405MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), @@ -10699,14 +10779,28 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1f12, "ASUS UM5302", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1f1f, "ASUS H7604JI/JV/J3D", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1f62, "ASUS UX7602ZM", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1f63, "ASUS P5405CSA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1f92, "ASUS ROG Flow X16", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1fb3, "ASUS ROG Flow Z13 GZ302EA", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1043, 0x3011, "ASUS B5605CVA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2), + SND_PCI_QUIRK(0x1043, 0x3061, "ASUS B3405CCA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3071, "ASUS B5405CCA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x30c1, "ASUS B3605CCA / P3605CCA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x30d1, "ASUS B5405CCA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x30e1, "ASUS B5605CCA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x31d0, "ASUS Zen AIO 27 Z272SD_A272SD", ALC274_FIXUP_ASUS_ZEN_AIO_27), + SND_PCI_QUIRK(0x1043, 0x31e1, "ASUS B5605CCA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x31f1, "ASUS B3605CCA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS), + SND_PCI_QUIRK(0x1043, 0x3d78, "ASUS GA603KH", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1043, 0x3d88, "ASUS GA603KM", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1043, 0x3e00, "ASUS G814FH/FM/FP", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1043, 0x3e20, "ASUS G814PH/PM/PP", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x3e30, "ASUS TP3607SA", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x3ee0, "ASUS Strix G815_JHR_JMR_JPR", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x3ef0, "ASUS Strix G635LR_LW_LX", ALC287_FIXUP_TAS2781_I2C), @@ -10714,6 +10808,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x3f10, "ASUS Strix G835LR_LW_LX", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x3f20, "ASUS Strix G615LR_LW", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x3f30, "ASUS Strix G815LR_LW", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x1043, 0x3fd0, "ASUS B3605CVA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x3ff0, "ASUS B5405CVA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x1043, 0x8398, "ASUS P1005", ALC269_FIXUP_STEREO_DMIC), @@ -10912,6 +11008,9 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3178, "ThinkCentre Station", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x31af, "ThinkCentre Station", ALC623_FIXUP_LENOVO_THINKSTATION_P340), SND_PCI_QUIRK(0x17aa, 0x334b, "Lenovo ThinkCentre M70 Gen5", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x17aa, 0x3384, "ThinkCentre M90a PRO", ALC233_FIXUP_LENOVO_L2MH_LOW_ENLED), + SND_PCI_QUIRK(0x17aa, 0x3386, "ThinkCentre M90a Gen6", ALC233_FIXUP_LENOVO_L2MH_LOW_ENLED), + SND_PCI_QUIRK(0x17aa, 0x3387, "ThinkCentre M70a Gen6", ALC233_FIXUP_LENOVO_L2MH_LOW_ENLED), SND_PCI_QUIRK(0x17aa, 0x3801, "Lenovo Yoga9 14IAP7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), HDA_CODEC_QUIRK(0x17aa, 0x3802, "DuetITL 2021", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3802, "Lenovo Yoga Pro 9 14IRP8", ALC287_FIXUP_TAS2781_I2C), @@ -11900,8 +11999,11 @@ static int patch_alc269(struct hda_codec *codec) spec->codec_variant = ALC269_TYPE_ALC300; spec->gen.mixer_nid = 0; /* no loopback on ALC300 */ break; + case 0x10ec0222: case 0x10ec0623: spec->codec_variant = ALC269_TYPE_ALC623; + spec->shutup = alc222_shutup; + spec->init_hook = alc222_init; break; case 0x10ec0700: case 0x10ec0701: diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index e0ed4fc11155a..e28bfefa72f33 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -303,6 +304,79 @@ void cs35l56_wait_min_reset_pulse(void) } EXPORT_SYMBOL_NS_GPL(cs35l56_wait_min_reset_pulse, "SND_SOC_CS35L56_SHARED"); +static const struct { + u32 addr; + u32 value; +} cs35l56_spi_system_reset_stages[] = { + { .addr = CS35L56_DSP_VIRTUAL1_MBOX_1, .value = CS35L56_MBOX_CMD_SYSTEM_RESET }, + /* The next write is necessary to delimit the soft reset */ + { .addr = CS35L56_DSP_MBOX_1_RAW, .value = CS35L56_MBOX_CMD_PING }, +}; + +static void cs35l56_spi_issue_bus_locked_reset(struct cs35l56_base *cs35l56_base, + struct spi_device *spi) +{ + struct cs35l56_spi_payload *buf = cs35l56_base->spi_payload_buf; + struct spi_transfer t = { + .tx_buf = buf, + .len = sizeof(*buf), + }; + struct spi_message m; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(cs35l56_spi_system_reset_stages); i++) { + buf->addr = cpu_to_be32(cs35l56_spi_system_reset_stages[i].addr); + buf->value = cpu_to_be32(cs35l56_spi_system_reset_stages[i].value); + spi_message_init_with_transfers(&m, &t, 1); + ret = spi_sync_locked(spi, &m); + if (ret) + dev_warn(cs35l56_base->dev, "spi_sync failed: %d\n", ret); + + usleep_range(CS35L56_SPI_RESET_TO_PORT_READY_US, + 2 * CS35L56_SPI_RESET_TO_PORT_READY_US); + } +} + +static void cs35l56_spi_system_reset(struct cs35l56_base *cs35l56_base) +{ + struct spi_device *spi = to_spi_device(cs35l56_base->dev); + unsigned int val; + int read_ret, ret; + + /* + * There must not be any other SPI bus activity while the amp is + * soft-resetting. + */ + ret = spi_bus_lock(spi->controller); + if (ret) { + dev_warn(cs35l56_base->dev, "spi_bus_lock failed: %d\n", ret); + return; + } + + cs35l56_spi_issue_bus_locked_reset(cs35l56_base, spi); + spi_bus_unlock(spi->controller); + + /* + * Check firmware boot by testing for a response in MBOX_2. + * HALO_STATE cannot be trusted yet because the reset sequence + * can leave it with stale state. But MBOX is reset. + * The regmap must remain in cache-only until the chip has + * booted, so use a bypassed read. + */ + ret = read_poll_timeout(regmap_read_bypassed, read_ret, + (val > 0) && (val < 0xffffffff), + CS35L56_HALO_STATE_POLL_US, + CS35L56_HALO_STATE_TIMEOUT_US, + false, + cs35l56_base->regmap, + CS35L56_DSP_VIRTUAL1_MBOX_2, + &val); + if (ret) { + dev_err(cs35l56_base->dev, "SPI reboot timed out(%d): MBOX2=%#x\n", + read_ret, val); + } +} + static const struct reg_sequence cs35l56_system_reset_seq[] = { REG_SEQ0(CS35L56_DSP1_HALO_STATE, 0), REG_SEQ0(CS35L56_DSP_VIRTUAL1_MBOX_1, CS35L56_MBOX_CMD_SYSTEM_RESET), @@ -315,6 +389,12 @@ void cs35l56_system_reset(struct cs35l56_base *cs35l56_base, bool is_soundwire) * accesses other than the controlled system reset sequence below. */ regcache_cache_only(cs35l56_base->regmap, true); + + if (cs35l56_is_spi(cs35l56_base)) { + cs35l56_spi_system_reset(cs35l56_base); + return; + } + regmap_multi_reg_write_bypassed(cs35l56_base->regmap, cs35l56_system_reset_seq, ARRAY_SIZE(cs35l56_system_reset_seq)); diff --git a/sound/soc/codecs/cs35l56-spi.c b/sound/soc/codecs/cs35l56-spi.c index c101134e85328..ca6c03a8766d3 100644 --- a/sound/soc/codecs/cs35l56-spi.c +++ b/sound/soc/codecs/cs35l56-spi.c @@ -33,6 +33,9 @@ static int cs35l56_spi_probe(struct spi_device *spi) cs35l56->base.dev = &spi->dev; cs35l56->base.can_hibernate = true; + ret = cs35l56_init_config_for_spi(&cs35l56->base, spi); + if (ret) + return ret; ret = cs35l56_common_probe(cs35l56); if (ret != 0) diff --git a/sound/soc/codecs/es8328.c b/sound/soc/codecs/es8328.c index f3c97da798dc8..76159c45e6b52 100644 --- a/sound/soc/codecs/es8328.c +++ b/sound/soc/codecs/es8328.c @@ -233,7 +233,6 @@ static const struct snd_kcontrol_new es8328_right_line_controls = /* Left Mixer */ static const struct snd_kcontrol_new es8328_left_mixer_controls[] = { - SOC_DAPM_SINGLE("Playback Switch", ES8328_DACCONTROL17, 7, 1, 0), SOC_DAPM_SINGLE("Left Bypass Switch", ES8328_DACCONTROL17, 6, 1, 0), SOC_DAPM_SINGLE("Right Playback Switch", ES8328_DACCONTROL18, 7, 1, 0), SOC_DAPM_SINGLE("Right Bypass Switch", ES8328_DACCONTROL18, 6, 1, 0), @@ -243,7 +242,6 @@ static const struct snd_kcontrol_new es8328_left_mixer_controls[] = { static const struct snd_kcontrol_new es8328_right_mixer_controls[] = { SOC_DAPM_SINGLE("Left Playback Switch", ES8328_DACCONTROL19, 7, 1, 0), SOC_DAPM_SINGLE("Left Bypass Switch", ES8328_DACCONTROL19, 6, 1, 0), - SOC_DAPM_SINGLE("Playback Switch", ES8328_DACCONTROL20, 7, 1, 0), SOC_DAPM_SINGLE("Right Bypass Switch", ES8328_DACCONTROL20, 6, 1, 0), }; @@ -336,10 +334,10 @@ static const struct snd_soc_dapm_widget es8328_dapm_widgets[] = { SND_SOC_DAPM_DAC("Left DAC", "Left Playback", ES8328_DACPOWER, ES8328_DACPOWER_LDAC_OFF, 1), - SND_SOC_DAPM_MIXER("Left Mixer", SND_SOC_NOPM, 0, 0, + SND_SOC_DAPM_MIXER("Left Mixer", ES8328_DACCONTROL17, 7, 0, &es8328_left_mixer_controls[0], ARRAY_SIZE(es8328_left_mixer_controls)), - SND_SOC_DAPM_MIXER("Right Mixer", SND_SOC_NOPM, 0, 0, + SND_SOC_DAPM_MIXER("Right Mixer", ES8328_DACCONTROL20, 7, 0, &es8328_right_mixer_controls[0], ARRAY_SIZE(es8328_right_mixer_controls)), @@ -418,19 +416,14 @@ static const struct snd_soc_dapm_route es8328_dapm_routes[] = { { "Right Line Mux", "PGA", "Right PGA Mux" }, { "Right Line Mux", "Differential", "Differential Mux" }, - { "Left Out 1", NULL, "Left DAC" }, - { "Right Out 1", NULL, "Right DAC" }, - { "Left Out 2", NULL, "Left DAC" }, - { "Right Out 2", NULL, "Right DAC" }, - - { "Left Mixer", "Playback Switch", "Left DAC" }, + { "Left Mixer", NULL, "Left DAC" }, { "Left Mixer", "Left Bypass Switch", "Left Line Mux" }, { "Left Mixer", "Right Playback Switch", "Right DAC" }, { "Left Mixer", "Right Bypass Switch", "Right Line Mux" }, { "Right Mixer", "Left Playback Switch", "Left DAC" }, { "Right Mixer", "Left Bypass Switch", "Left Line Mux" }, - { "Right Mixer", "Playback Switch", "Right DAC" }, + { "Right Mixer", NULL, "Right DAC" }, { "Right Mixer", "Right Bypass Switch", "Right Line Mux" }, { "DAC DIG", NULL, "DAC STM" }, diff --git a/sound/soc/codecs/tas2764.c b/sound/soc/codecs/tas2764.c index d482cd194c08c..58315eab492a1 100644 --- a/sound/soc/codecs/tas2764.c +++ b/sound/soc/codecs/tas2764.c @@ -365,7 +365,7 @@ static int tas2764_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) { struct snd_soc_component *component = dai->component; struct tas2764_priv *tas2764 = snd_soc_component_get_drvdata(component); - u8 tdm_rx_start_slot = 0, asi_cfg_0 = 0, asi_cfg_1 = 0; + u8 tdm_rx_start_slot = 0, asi_cfg_0 = 0, asi_cfg_1 = 0, asi_cfg_4 = 0; int ret; switch (fmt & SND_SOC_DAIFMT_INV_MASK) { @@ -374,12 +374,14 @@ static int tas2764_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) fallthrough; case SND_SOC_DAIFMT_NB_NF: asi_cfg_1 = TAS2764_TDM_CFG1_RX_RISING; + asi_cfg_4 = TAS2764_TDM_CFG4_TX_FALLING; break; case SND_SOC_DAIFMT_IB_IF: asi_cfg_0 ^= TAS2764_TDM_CFG0_FRAME_START; fallthrough; case SND_SOC_DAIFMT_IB_NF: asi_cfg_1 = TAS2764_TDM_CFG1_RX_FALLING; + asi_cfg_4 = TAS2764_TDM_CFG4_TX_RISING; break; } @@ -389,6 +391,12 @@ static int tas2764_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) if (ret < 0) return ret; + ret = snd_soc_component_update_bits(component, TAS2764_TDM_CFG4, + TAS2764_TDM_CFG4_TX_MASK, + asi_cfg_4); + if (ret < 0) + return ret; + switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { case SND_SOC_DAIFMT_I2S: asi_cfg_0 ^= TAS2764_TDM_CFG0_FRAME_START; diff --git a/sound/soc/codecs/tas2764.h b/sound/soc/codecs/tas2764.h index 168af772a898f..9490f2686e389 100644 --- a/sound/soc/codecs/tas2764.h +++ b/sound/soc/codecs/tas2764.h @@ -25,7 +25,7 @@ /* Power Control */ #define TAS2764_PWR_CTRL TAS2764_REG(0X0, 0x02) -#define TAS2764_PWR_CTRL_MASK GENMASK(1, 0) +#define TAS2764_PWR_CTRL_MASK GENMASK(2, 0) #define TAS2764_PWR_CTRL_ACTIVE 0x0 #define TAS2764_PWR_CTRL_MUTE BIT(0) #define TAS2764_PWR_CTRL_SHUTDOWN BIT(1) @@ -79,6 +79,12 @@ #define TAS2764_TDM_CFG3_RXS_SHIFT 0x4 #define TAS2764_TDM_CFG3_MASK GENMASK(3, 0) +/* TDM Configuration Reg4 */ +#define TAS2764_TDM_CFG4 TAS2764_REG(0X0, 0x0d) +#define TAS2764_TDM_CFG4_TX_MASK BIT(0) +#define TAS2764_TDM_CFG4_TX_RISING 0x0 +#define TAS2764_TDM_CFG4_TX_FALLING BIT(0) + /* TDM Configuration Reg5 */ #define TAS2764_TDM_CFG5 TAS2764_REG(0X0, 0x0e) #define TAS2764_TDM_CFG5_VSNS_MASK BIT(6) diff --git a/sound/soc/codecs/tas2770.c b/sound/soc/codecs/tas2770.c index 9f93b230652a5..863c3f672ba98 100644 --- a/sound/soc/codecs/tas2770.c +++ b/sound/soc/codecs/tas2770.c @@ -506,7 +506,7 @@ static int tas2770_codec_probe(struct snd_soc_component *component) } static DECLARE_TLV_DB_SCALE(tas2770_digital_tlv, 1100, 50, 0); -static DECLARE_TLV_DB_SCALE(tas2770_playback_volume, -12750, 50, 0); +static DECLARE_TLV_DB_SCALE(tas2770_playback_volume, -10050, 50, 0); static const struct snd_kcontrol_new tas2770_snd_controls[] = { SOC_SINGLE_TLV("Speaker Playback Volume", TAS2770_PLAY_CFG_REG2, diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index c4eb87c5d39e4..9f33dd11d47f6 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -994,10 +994,10 @@ static struct snd_soc_dai_driver fsl_sai_dai_template[] = { { .name = "sai-tx", .playback = { - .stream_name = "CPU-Playback", + .stream_name = "SAI-Playback", .channels_min = 1, .channels_max = 32, - .rate_min = 8000, + .rate_min = 8000, .rate_max = 2822400, .rates = SNDRV_PCM_RATE_KNOT, .formats = FSL_SAI_FORMATS, @@ -1007,7 +1007,7 @@ static struct snd_soc_dai_driver fsl_sai_dai_template[] = { { .name = "sai-rx", .capture = { - .stream_name = "CPU-Capture", + .stream_name = "SAI-Capture", .channels_min = 1, .channels_max = 32, .rate_min = 8000, diff --git a/sound/soc/fsl/imx-audmix.c b/sound/soc/fsl/imx-audmix.c index 50ecc5f51100e..dac5d4ddacd6e 100644 --- a/sound/soc/fsl/imx-audmix.c +++ b/sound/soc/fsl/imx-audmix.c @@ -119,8 +119,8 @@ static const struct snd_soc_ops imx_audmix_be_ops = { static const char *name[][3] = { {"HiFi-AUDMIX-FE-0", "HiFi-AUDMIX-FE-1", "HiFi-AUDMIX-FE-2"}, {"sai-tx", "sai-tx", "sai-rx"}, - {"AUDMIX-Playback-0", "AUDMIX-Playback-1", "CPU-Capture"}, - {"CPU-Playback", "CPU-Playback", "AUDMIX-Capture-0"}, + {"AUDMIX-Playback-0", "AUDMIX-Playback-1", "SAI-Capture"}, + {"SAI-Playback", "SAI-Playback", "AUDMIX-Capture-0"}, }; static int imx_audmix_probe(struct platform_device *pdev) diff --git a/sound/soc/fsl/imx-pcm-fiq.c b/sound/soc/fsl/imx-pcm-fiq.c index 3391430e42532..83de3ae336910 100644 --- a/sound/soc/fsl/imx-pcm-fiq.c +++ b/sound/soc/fsl/imx-pcm-fiq.c @@ -185,8 +185,7 @@ static int snd_imx_open(struct snd_soc_component *component, atomic_set(&iprtd->playing, 0); atomic_set(&iprtd->capturing, 0); - hrtimer_init(&iprtd->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - iprtd->hrt.function = snd_hrtimer_callback; + hrtimer_setup(&iprtd->hrt, snd_hrtimer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ret = snd_pcm_hw_constraint_integer(substream->runtime, SNDRV_PCM_HW_PARAM_PERIODS); diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 203b07d4d833c..c13064c777261 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -803,7 +803,9 @@ static int create_sdw_dailink(struct snd_soc_card *card, int *be_id, struct snd_soc_codec_conf **codec_conf) { struct device *dev = card->dev; + struct snd_soc_acpi_mach *mach = dev_get_platdata(card->dev); struct asoc_sdw_mc_private *ctx = snd_soc_card_get_drvdata(card); + struct snd_soc_acpi_mach_params *mach_params = &mach->mach_params; struct intel_mc_ctx *intel_ctx = (struct intel_mc_ctx *)ctx->private; struct asoc_sdw_endpoint *sof_end; int stream; @@ -900,6 +902,11 @@ static int create_sdw_dailink(struct snd_soc_card *card, codecs[j].name = sof_end->codec_name; codecs[j].dai_name = sof_end->dai_info->dai_name; + if (sof_end->dai_info->dai_type == SOC_SDW_DAI_TYPE_MIC && + mach_params->dmic_num > 0) { + dev_warn(dev, + "Both SDW DMIC and PCH DMIC are present, if incorrect, please set kernel params snd_sof_intel_hda_generic dmic_num=0 to disable PCH DMIC\n"); + } j++; } diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index be689f6e10c81..a1ccd95da8bb7 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -1312,22 +1312,8 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev) /* report to machine driver if any DMICs are found */ mach->mach_params.dmic_num = check_dmic_num(sdev); - if (sdw_mach_found) { - /* - * DMICs use up to 4 pins and are typically pin-muxed with SoundWire - * link 2 and 3, or link 1 and 2, thus we only try to enable dmics - * if all conditions are true: - * a) 2 or fewer links are used by SoundWire - * b) the NHLT table reports the presence of microphones - */ - if (hweight_long(mach->link_mask) <= 2) - dmic_fixup = true; - else - mach->mach_params.dmic_num = 0; - } else { - if (mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER) - dmic_fixup = true; - } + if (sdw_mach_found || mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER) + dmic_fixup = true; if (tplg_fixup && dmic_fixup && diff --git a/sound/usb/midi.c b/sound/usb/midi.c index 737dd00e97b14..779d97d31f170 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1145,7 +1145,7 @@ static int snd_usbmidi_output_close(struct snd_rawmidi_substream *substream) { struct usbmidi_out_port *port = substream->runtime->private_data; - cancel_work_sync(&port->ep->work); + flush_work(&port->ep->work); return substream_open(substream, 0, 0); } diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a97efb7b131ea..09210fb4ac60c 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1868,6 +1868,7 @@ void snd_usb_set_format_quirk(struct snd_usb_substream *subs, case USB_ID(0x534d, 0x2109): /* MacroSilicon MS2109 */ subs->stream_offset_adj = 2; break; + case USB_ID(0x2b73, 0x000a): /* Pioneer DJM-900NXS2 */ case USB_ID(0x2b73, 0x0013): /* Pioneer DJM-450 */ pioneer_djm_set_format_quirk(subs, 0x0082); break; diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c index 5f81c68fd42b6..5756ff3528a2d 100644 --- a/sound/usb/usx2y/usbusx2y.c +++ b/sound/usb/usx2y/usbusx2y.c @@ -151,6 +151,12 @@ static int snd_usx2y_card_used[SNDRV_CARDS]; static void snd_usx2y_card_private_free(struct snd_card *card); static void usx2y_unlinkseq(struct snd_usx2y_async_seq *s); +#ifdef USX2Y_NRPACKS_VARIABLE +int nrpacks = USX2Y_NRPACKS; /* number of packets per urb */ +module_param(nrpacks, int, 0444); +MODULE_PARM_DESC(nrpacks, "Number of packets per URB."); +#endif + /* * pipe 4 is used for switching the lamps, setting samplerate, volumes .... */ @@ -432,6 +438,11 @@ static int snd_usx2y_probe(struct usb_interface *intf, struct snd_card *card; int err; +#ifdef USX2Y_NRPACKS_VARIABLE + if (nrpacks < 0 || nrpacks > USX2Y_NRPACKS_MAX) + return -EINVAL; +#endif + if (le16_to_cpu(device->descriptor.idVendor) != 0x1604 || (le16_to_cpu(device->descriptor.idProduct) != USB_ID_US122 && le16_to_cpu(device->descriptor.idProduct) != USB_ID_US224 && diff --git a/sound/usb/usx2y/usbusx2y.h b/sound/usb/usx2y/usbusx2y.h index 391fd7b4ed5ef..6a76d04bf1c7d 100644 --- a/sound/usb/usx2y/usbusx2y.h +++ b/sound/usb/usx2y/usbusx2y.h @@ -7,6 +7,32 @@ #define NRURBS 2 +/* Default value used for nr of packs per urb. + * 1 to 4 have been tested ok on uhci. + * To use 3 on ohci, you'd need a patch: + * look for "0000425-linux-2.6.9-rc4-mm1_ohci-hcd.patch.gz" on + * "https://bugtrack.alsa-project.org/alsa-bug/bug_view_page.php?bug_id=0000425" + * + * 1, 2 and 4 work out of the box on ohci, if I recall correctly. + * Bigger is safer operation, smaller gives lower latencies. + */ +#define USX2Y_NRPACKS 4 + +#define USX2Y_NRPACKS_MAX 1024 + +/* If your system works ok with this module's parameter + * nrpacks set to 1, you might as well comment + * this define out, and thereby produce smaller, faster code. + * You'd also set USX2Y_NRPACKS to 1 then. + */ +#define USX2Y_NRPACKS_VARIABLE 1 + +#ifdef USX2Y_NRPACKS_VARIABLE +extern int nrpacks; +#define nr_of_packs() nrpacks +#else +#define nr_of_packs() USX2Y_NRPACKS +#endif #define URBS_ASYNC_SEQ 10 #define URB_DATA_LEN_ASYNC_SEQ 32 diff --git a/sound/usb/usx2y/usbusx2yaudio.c b/sound/usb/usx2y/usbusx2yaudio.c index f540f46a0b143..acca8bead82e5 100644 --- a/sound/usb/usx2y/usbusx2yaudio.c +++ b/sound/usb/usx2y/usbusx2yaudio.c @@ -28,33 +28,6 @@ #include "usx2y.h" #include "usbusx2y.h" -/* Default value used for nr of packs per urb. - * 1 to 4 have been tested ok on uhci. - * To use 3 on ohci, you'd need a patch: - * look for "0000425-linux-2.6.9-rc4-mm1_ohci-hcd.patch.gz" on - * "https://bugtrack.alsa-project.org/alsa-bug/bug_view_page.php?bug_id=0000425" - * - * 1, 2 and 4 work out of the box on ohci, if I recall correctly. - * Bigger is safer operation, smaller gives lower latencies. - */ -#define USX2Y_NRPACKS 4 - -/* If your system works ok with this module's parameter - * nrpacks set to 1, you might as well comment - * this define out, and thereby produce smaller, faster code. - * You'd also set USX2Y_NRPACKS to 1 then. - */ -#define USX2Y_NRPACKS_VARIABLE 1 - -#ifdef USX2Y_NRPACKS_VARIABLE -static int nrpacks = USX2Y_NRPACKS; /* number of packets per urb */ -#define nr_of_packs() nrpacks -module_param(nrpacks, int, 0444); -MODULE_PARM_DESC(nrpacks, "Number of packets per URB."); -#else -#define nr_of_packs() USX2Y_NRPACKS -#endif - static int usx2y_urb_capt_retire(struct snd_usx2y_substream *subs) { struct urb *urb = subs->completed_urb; diff --git a/tools/arch/arm64/tools/Makefile b/tools/arch/arm64/tools/Makefile index 7b42feedf6471..de4f1b66ef014 100644 --- a/tools/arch/arm64/tools/Makefile +++ b/tools/arch/arm64/tools/Makefile @@ -13,12 +13,6 @@ AWK ?= awk MKDIR ?= mkdir RM ?= rm -ifeq ($(V),1) -Q = -else -Q = @ -endif - arm64_tools_dir = $(top_srcdir)/arch/arm64/tools arm64_sysreg_tbl = $(arm64_tools_dir)/sysreg arm64_gen_sysreg = $(arm64_tools_dir)/gen-sysreg.awk diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 93807b437e4de..cb1740bc3da2d 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 17b6590748c00..9e3fa7942e7d3 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -2,14 +2,6 @@ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - /* * Defines x86 CPU feature bits */ @@ -210,7 +202,6 @@ #define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h deleted file mode 100644 index c492bdc97b059..0000000000000 --- a/tools/arch/x86/include/asm/disabled-features.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_DISABLED_FEATURES_H -#define _ASM_X86_DISABLED_FEATURES_H - -/* These features, although they might be available in a CPU - * will not be used because the compile options to support - * them are not present. - * - * This code allows them to be checked and disabled at - * compile time without an explicit #ifdef. Use - * cpu_feature_enabled(). - */ - -#ifdef CONFIG_X86_UMIP -# define DISABLE_UMIP 0 -#else -# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) -#endif - -#ifdef CONFIG_X86_64 -# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) -# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) -# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) -# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) -# define DISABLE_PCID 0 -#else -# define DISABLE_VME 0 -# define DISABLE_K6_MTRR 0 -# define DISABLE_CYRIX_ARR 0 -# define DISABLE_CENTAUR_MCR 0 -# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU 0 -# define DISABLE_OSPKE 0 -#else -# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - -#ifdef CONFIG_X86_5LEVEL -# define DISABLE_LA57 0 -#else -# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#endif - -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION -# define DISABLE_PTI 0 -#else -# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -#endif - -#ifdef CONFIG_MITIGATION_RETPOLINE -# define DISABLE_RETPOLINE 0 -#else -# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) -#endif - -#ifdef CONFIG_MITIGATION_RETHUNK -# define DISABLE_RETHUNK 0 -#else -# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) -#endif - -#ifdef CONFIG_MITIGATION_UNRET_ENTRY -# define DISABLE_UNRET 0 -#else -# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -#endif - -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING -# define DISABLE_CALL_DEPTH_TRACKING 0 -#else -# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) -#endif - -#ifdef CONFIG_ADDRESS_MASKING -# define DISABLE_LAM 0 -#else -# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) -#endif - -#ifdef CONFIG_INTEL_IOMMU_SVM -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif - -#ifdef CONFIG_X86_SGX -# define DISABLE_SGX 0 -#else -# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) -#endif - -#ifdef CONFIG_XEN_PV -# define DISABLE_XENPV 0 -#else -# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) -#endif - -#ifdef CONFIG_INTEL_TDX_GUEST -# define DISABLE_TDX_GUEST 0 -#else -# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) -#endif - -#ifdef CONFIG_X86_USER_SHADOW_STACK -#define DISABLE_USER_SHSTK 0 -#else -#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) -#endif - -#ifdef CONFIG_X86_KERNEL_IBT -#define DISABLE_IBT 0 -#else -#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) -#endif - -#ifdef CONFIG_X86_FRED -# define DISABLE_FRED 0 -#else -# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) -#endif - -#ifdef CONFIG_KVM_AMD_SEV -#define DISABLE_SEV_SNP 0 -#else -#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) -#endif - -/* - * Make sure to add features to the correct mask - */ -#define DISABLED_MASK0 (DISABLE_VME) -#define DISABLED_MASK1 0 -#define DISABLED_MASK2 0 -#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 (DISABLE_PCID) -#define DISABLED_MASK5 0 -#define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) -#define DISABLED_MASK9 (DISABLE_SGX) -#define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) -#define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 -#define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ - DISABLE_ENQCMD) -#define DISABLED_MASK17 0 -#define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 (DISABLE_SEV_SNP) -#define DISABLED_MASK20 0 -#define DISABLED_MASK21 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 3ae84c3b8e6db..dc1c1057f26e4 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) @@ -34,6 +35,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h deleted file mode 100644 index e9187ddd3d1fd..0000000000000 --- a/tools/arch/x86/include/asm/required-features.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#define _ASM_X86_REQUIRED_FEATURES_H - -/* Define minimum CPUID feature set for kernel These bits are checked - really early to actually display a visible error message before the - kernel dies. Make sure to assign features to the proper mask! - - Some requirements that are not in CPUID yet are also in the - CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. - - The real information is in arch/x86/Kconfig.cpu, this just converts - the CONFIGs into a bitmask */ - -#ifndef CONFIG_MATH_EMULATION -# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) -#else -# define NEED_FPU 0 -#endif - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -#else -# define NEED_PAE 0 -#endif - -#ifdef CONFIG_X86_CMPXCHG64 -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) -#else -# define NEED_CX8 0 -#endif - -#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) -# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) -#else -# define NEED_CMOV 0 -#endif - -# define NEED_3DNOW 0 - -#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) -# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) -#else -# define NEED_NOPL 0 -#endif - -#ifdef CONFIG_MATOM -# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) -#else -# define NEED_MOVBE 0 -#endif - -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL -/* Paravirtualized systems may not have PSE or PGE available */ -#define NEED_PSE 0 -#define NEED_PGE 0 -#else -#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) -#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) -#endif -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) -#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) -#define NEED_LM (1<<(X86_FEATURE_LM & 31)) -#else -#define NEED_PSE 0 -#define NEED_MSR 0 -#define NEED_PGE 0 -#define NEED_FXSR 0 -#define NEED_XMM 0 -#define NEED_XMM2 0 -#define NEED_LM 0 -#endif - -#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ - NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ - NEED_XMM|NEED_XMM2) -#define SSE_MASK (NEED_XMM|NEED_XMM2) - -#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) - -#define REQUIRED_MASK2 0 -#define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 (NEED_MOVBE) -#define REQUIRED_MASK5 0 -#define REQUIRED_MASK6 0 -#define REQUIRED_MASK7 0 -#define REQUIRED_MASK8 0 -#define REQUIRED_MASK9 0 -#define REQUIRED_MASK10 0 -#define REQUIRED_MASK11 0 -#define REQUIRED_MASK12 0 -#define REQUIRED_MASK13 0 -#define REQUIRED_MASK14 0 -#define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 -#define REQUIRED_MASK17 0 -#define REQUIRED_MASK18 0 -#define REQUIRED_MASK19 0 -#define REQUIRED_MASK20 0 -#define REQUIRED_MASK21 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 243b79f2b451e..062bbd6cd048e 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -27,12 +27,6 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif -ifeq ($(V),1) - Q = -else - Q = @ -endif - FEATURE_USER = .bpf FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled FEATURE_DISPLAY = libbfd diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 4315652678b9f..bf843f328812e 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -5,12 +5,6 @@ INSTALL ?= install RM ?= rm -f RMDIR ?= rmdir --ignore-fail-on-non-empty -ifeq ($(V),1) - Q = -else - Q = @ -endif - prefix ?= /usr/local mandir ?= $(prefix)/man man8dir = $(mandir)/man8 diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index dd9f3ec842017..6ea4823b770cb 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -7,12 +7,6 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif -ifeq ($(V),1) - Q = -else - Q = @ -endif - BPF_DIR = $(srctree)/tools/lib/bpf ifneq ($(OUTPUT),) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 4b8079f294f65..afbddea3a39c6 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -5,10 +5,8 @@ include ../../scripts/Makefile.arch srctree := $(abspath $(CURDIR)/../../../) ifeq ($(V),1) - Q = msg = else - Q = @ ifeq ($(silent),1) msg = else diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 5613b5736d938..78a436c4072e3 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -27,10 +27,7 @@ VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) -ifeq ($(V),1) -Q = -else -Q = @ +ifneq ($(V),1) MAKEFLAGS += --no-print-directory submake_extras := feature_display=0 endif diff --git a/tools/build/Makefile b/tools/build/Makefile index 18ad131f6ea74..63ef218787616 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -17,13 +17,7 @@ $(call allow-override,LD,$(CROSS_COMPILE)ld) export HOSTCC HOSTLD HOSTAR -ifeq ($(V),1) - Q = -else - Q = @ -endif - -export Q srctree CC LD +export srctree CC LD MAKEFLAGS := --no-print-directory build := -f $(srctree)/tools/build/Makefile.build dir=. obj diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index a1f55fb24bb38..c1299a0531457 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -30,6 +30,7 @@ all_files := \ crt.h \ ctype.h \ errno.h \ + limits.h \ nolibc.h \ signal.h \ stackprotector.h \ diff --git a/tools/include/nolibc/limits.h b/tools/include/nolibc/limits.h new file mode 100644 index 0000000000000..306d4141f4d24 --- /dev/null +++ b/tools/include/nolibc/limits.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Shim limits.h header for NOLIBC. + * Copyright (C) 2025 Thomas Weißschuh + */ + +#include "nolibc.h" diff --git a/tools/include/uapi/linux/elf.h b/tools/include/uapi/linux/elf.h new file mode 100644 index 0000000000000..5834b83d7f9a4 --- /dev/null +++ b/tools/include/uapi/linux/elf.h @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_ELF_H +#define _LINUX_ELF_H + +#include +#include + +/* 32-bit ELF base types. */ +typedef __u32 Elf32_Addr; +typedef __u16 Elf32_Half; +typedef __u32 Elf32_Off; +typedef __s32 Elf32_Sword; +typedef __u32 Elf32_Word; +typedef __u16 Elf32_Versym; + +/* 64-bit ELF base types. */ +typedef __u64 Elf64_Addr; +typedef __u16 Elf64_Half; +typedef __s16 Elf64_SHalf; +typedef __u64 Elf64_Off; +typedef __s32 Elf64_Sword; +typedef __u32 Elf64_Word; +typedef __u64 Elf64_Xword; +typedef __s64 Elf64_Sxword; +typedef __u16 Elf64_Versym; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* OS-specific */ +#define PT_HIOS 0x6fffffff /* OS-specific */ +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME (PT_LOOS + 0x474e550) +#define PT_GNU_STACK (PT_LOOS + 0x474e551) +#define PT_GNU_RELRO (PT_LOOS + 0x474e552) +#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553) + + +/* ARM MTE memory tag segment type */ +#define PT_AARCH64_MEMTAG_MTE (PT_LOPROC + 0x2) + +/* + * Extended Numbering + * + * If the real number of program header table entries is larger than + * or equal to PN_XNUM(0xffff), it is set to sh_info field of the + * section header at index 0, and PN_XNUM is set to e_phnum + * field. Otherwise, the section header at index 0 is zero + * initialized, if it exists. + * + * Specifications are available in: + * + * - Oracle: Linker and Libraries. + * Part No: 817–1984–19, August 2011. + * https://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf + * + * - System V ABI AMD64 Architecture Processor Supplement + * Draft Version 0.99.4, + * January 13, 2010. + * http://www.cs.washington.edu/education/courses/cse351/12wi/supp-docs/abi.pdf + */ +#define PN_XNUM 0xffff + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* This is the info that is needed to parse the dynamic section of the file */ +#define DT_NULL 0 +#define DT_NEEDED 1 +#define DT_PLTRELSZ 2 +#define DT_PLTGOT 3 +#define DT_HASH 4 +#define DT_STRTAB 5 +#define DT_SYMTAB 6 +#define DT_RELA 7 +#define DT_RELASZ 8 +#define DT_RELAENT 9 +#define DT_STRSZ 10 +#define DT_SYMENT 11 +#define DT_INIT 12 +#define DT_FINI 13 +#define DT_SONAME 14 +#define DT_RPATH 15 +#define DT_SYMBOLIC 16 +#define DT_REL 17 +#define DT_RELSZ 18 +#define DT_RELENT 19 +#define DT_PLTREL 20 +#define DT_DEBUG 21 +#define DT_TEXTREL 22 +#define DT_JMPREL 23 +#define DT_ENCODING 32 +#define OLD_DT_LOOS 0x60000000 +#define DT_LOOS 0x6000000d +#define DT_HIOS 0x6ffff000 +#define DT_VALRNGLO 0x6ffffd00 +#define DT_VALRNGHI 0x6ffffdff +#define DT_ADDRRNGLO 0x6ffffe00 +#define DT_GNU_HASH 0x6ffffef5 +#define DT_ADDRRNGHI 0x6ffffeff +#define DT_VERSYM 0x6ffffff0 +#define DT_RELACOUNT 0x6ffffff9 +#define DT_RELCOUNT 0x6ffffffa +#define DT_FLAGS_1 0x6ffffffb +#define DT_VERDEF 0x6ffffffc +#define DT_VERDEFNUM 0x6ffffffd +#define DT_VERNEED 0x6ffffffe +#define DT_VERNEEDNUM 0x6fffffff +#define OLD_DT_HIOS 0x6fffffff +#define DT_LOPROC 0x70000000 +#define DT_HIPROC 0x7fffffff + +/* This info is needed when parsing the symbol table */ +#define STB_LOCAL 0 +#define STB_GLOBAL 1 +#define STB_WEAK 2 + +#define STN_UNDEF 0 + +#define STT_NOTYPE 0 +#define STT_OBJECT 1 +#define STT_FUNC 2 +#define STT_SECTION 3 +#define STT_FILE 4 +#define STT_COMMON 5 +#define STT_TLS 6 + +#define VER_FLG_BASE 0x1 +#define VER_FLG_WEAK 0x2 + +#define ELF_ST_BIND(x) ((x) >> 4) +#define ELF_ST_TYPE(x) ((x) & 0xf) +#define ELF32_ST_BIND(x) ELF_ST_BIND(x) +#define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) +#define ELF64_ST_BIND(x) ELF_ST_BIND(x) +#define ELF64_ST_TYPE(x) ELF_ST_TYPE(x) + +typedef struct { + Elf32_Sword d_tag; + union { + Elf32_Sword d_val; + Elf32_Addr d_ptr; + } d_un; +} Elf32_Dyn; + +typedef struct { + Elf64_Sxword d_tag; /* entry tag value */ + union { + Elf64_Xword d_val; + Elf64_Addr d_ptr; + } d_un; +} Elf64_Dyn; + +/* The following are used with relocations */ +#define ELF32_R_SYM(x) ((x) >> 8) +#define ELF32_R_TYPE(x) ((x) & 0xff) + +#define ELF64_R_SYM(i) ((i) >> 32) +#define ELF64_R_TYPE(i) ((i) & 0xffffffff) + +typedef struct elf32_rel { + Elf32_Addr r_offset; + Elf32_Word r_info; +} Elf32_Rel; + +typedef struct elf64_rel { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ +} Elf64_Rel; + +typedef struct elf32_rela { + Elf32_Addr r_offset; + Elf32_Word r_info; + Elf32_Sword r_addend; +} Elf32_Rela; + +typedef struct elf64_rela { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ + Elf64_Sxword r_addend; /* Constant addend used to compute value */ +} Elf64_Rela; + +typedef struct elf32_sym { + Elf32_Word st_name; + Elf32_Addr st_value; + Elf32_Word st_size; + unsigned char st_info; + unsigned char st_other; + Elf32_Half st_shndx; +} Elf32_Sym; + +typedef struct elf64_sym { + Elf64_Word st_name; /* Symbol name, index in string tbl */ + unsigned char st_info; /* Type and binding attributes */ + unsigned char st_other; /* No defined meaning, 0 */ + Elf64_Half st_shndx; /* Associated section index */ + Elf64_Addr st_value; /* Value of the symbol */ + Elf64_Xword st_size; /* Associated symbol size */ +} Elf64_Sym; + + +#define EI_NIDENT 16 + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[EI_NIDENT]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +/* sh_type */ +#define SHT_NULL 0 +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_HASH 5 +#define SHT_DYNAMIC 6 +#define SHT_NOTE 7 +#define SHT_NOBITS 8 +#define SHT_REL 9 +#define SHT_SHLIB 10 +#define SHT_DYNSYM 11 +#define SHT_NUM 12 +#define SHT_LOPROC 0x70000000 +#define SHT_HIPROC 0x7fffffff +#define SHT_LOUSER 0x80000000 +#define SHT_HIUSER 0xffffffff + +/* sh_flags */ +#define SHF_WRITE 0x1 +#define SHF_ALLOC 0x2 +#define SHF_EXECINSTR 0x4 +#define SHF_RELA_LIVEPATCH 0x00100000 +#define SHF_RO_AFTER_INIT 0x00200000 +#define SHF_MASKPROC 0xf0000000 + +/* special section indexes */ +#define SHN_UNDEF 0 +#define SHN_LORESERVE 0xff00 +#define SHN_LOPROC 0xff00 +#define SHN_HIPROC 0xff1f +#define SHN_LIVEPATCH 0xff20 +#define SHN_ABS 0xfff1 +#define SHN_COMMON 0xfff2 +#define SHN_HIRESERVE 0xffff + +typedef struct elf32_shdr { + Elf32_Word sh_name; + Elf32_Word sh_type; + Elf32_Word sh_flags; + Elf32_Addr sh_addr; + Elf32_Off sh_offset; + Elf32_Word sh_size; + Elf32_Word sh_link; + Elf32_Word sh_info; + Elf32_Word sh_addralign; + Elf32_Word sh_entsize; +} Elf32_Shdr; + +typedef struct elf64_shdr { + Elf64_Word sh_name; /* Section name, index in string tbl */ + Elf64_Word sh_type; /* Type of section */ + Elf64_Xword sh_flags; /* Miscellaneous section attributes */ + Elf64_Addr sh_addr; /* Section virtual addr at execution */ + Elf64_Off sh_offset; /* Section file offset */ + Elf64_Xword sh_size; /* Size of section in bytes */ + Elf64_Word sh_link; /* Index of another section */ + Elf64_Word sh_info; /* Additional section information */ + Elf64_Xword sh_addralign; /* Section alignment */ + Elf64_Xword sh_entsize; /* Entry size if section holds table */ +} Elf64_Shdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +#ifndef ELF_OSABI +#define ELF_OSABI ELFOSABI_NONE +#endif + +/* + * Notes used in ET_CORE. Architectures export some of the arch register sets + * using the corresponding note types via the PTRACE_GETREGSET and + * PTRACE_SETREGSET requests. + * The note name for these types is "LINUX", except NT_PRFPREG that is named + * "CORE". + */ +#define NT_PRSTATUS 1 +#define NT_PRFPREG 2 +#define NT_PRPSINFO 3 +#define NT_TASKSTRUCT 4 +#define NT_AUXV 6 +/* + * Note to userspace developers: size of NT_SIGINFO note may increase + * in the future to accomodate more fields, don't assume it is fixed! + */ +#define NT_SIGINFO 0x53494749 +#define NT_FILE 0x46494c45 +#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ +#define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ +#define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ +#define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ +#define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +/* Old binutils treats 0x203 as a CET state */ +#define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ +#define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ +#define NT_S390_TIMER 0x301 /* s390 timer register */ +#define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ +#define NT_S390_TODPREG 0x303 /* s390 TOD programmable register */ +#define NT_S390_CTRS 0x304 /* s390 control registers */ +#define NT_S390_PREFIX 0x305 /* s390 prefix register */ +#define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ +#define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ +#define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +#define NT_ARM_TLS 0x401 /* ARM TLS register */ +#define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ +#define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ +#define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ +#define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ +#define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ +#define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ +#define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ +#define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ +#define NT_LOONGARCH_HW_BREAK 0xa05 /* LoongArch hardware breakpoint registers */ +#define NT_LOONGARCH_HW_WATCH 0xa06 /* LoongArch hardware watchpoint registers */ + +/* Note types with note name "GNU" */ +#define NT_GNU_PROPERTY_TYPE_0 5 + +/* Note header in a PT_NOTE section */ +typedef struct elf32_note { + Elf32_Word n_namesz; /* Name size */ + Elf32_Word n_descsz; /* Content size */ + Elf32_Word n_type; /* Content type */ +} Elf32_Nhdr; + +/* Note header in a PT_NOTE section */ +typedef struct elf64_note { + Elf64_Word n_namesz; /* Name size */ + Elf64_Word n_descsz; /* Content size */ + Elf64_Word n_type; /* Content type */ +} Elf64_Nhdr; + +/* .note.gnu.property types for EM_AARCH64: */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) + +typedef struct { + Elf32_Half vd_version; + Elf32_Half vd_flags; + Elf32_Half vd_ndx; + Elf32_Half vd_cnt; + Elf32_Word vd_hash; + Elf32_Word vd_aux; + Elf32_Word vd_next; +} Elf32_Verdef; + +typedef struct { + Elf64_Half vd_version; + Elf64_Half vd_flags; + Elf64_Half vd_ndx; + Elf64_Half vd_cnt; + Elf64_Word vd_hash; + Elf64_Word vd_aux; + Elf64_Word vd_next; +} Elf64_Verdef; + +typedef struct { + Elf32_Word vda_name; + Elf32_Word vda_next; +} Elf32_Verdaux; + +typedef struct { + Elf64_Word vda_name; + Elf64_Word vda_next; +} Elf64_Verdaux; + +#endif /* _LINUX_ELF_H */ diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 857a5f7b413d6..168140f8e6461 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -53,13 +53,6 @@ include $(srctree)/tools/scripts/Makefile.include # copy a bit from Linux kbuild -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - INCLUDES = -I$(or $(OUTPUT),.) \ -I$(srctree)/tools/include -I$(srctree)/tools/include/uapi \ -I$(srctree)/tools/arch/$(SRCARCH)/include @@ -96,12 +89,6 @@ override CFLAGS += $(CLANG_CROSS_FLAGS) # flags specific for shared library SHLIB_FLAGS := -DSHARED -fPIC -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - # Disable command line variables (CFLAGS) override from top # level Makefile (perf), otherwise build Makefile will get # the same command line setup. diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile index 3a9b2140aa048..e9a7ac2c062e2 100644 --- a/tools/lib/perf/Makefile +++ b/tools/lib/perf/Makefile @@ -39,19 +39,6 @@ libdir = $(prefix)/$(libdir_relative) libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - TEST_ARGS := $(if $(V),-v) # Set compile option CFLAGS diff --git a/tools/lib/thermal/Makefile b/tools/lib/thermal/Makefile index 8890fd57b110c..a1f5e388644d3 100644 --- a/tools/lib/thermal/Makefile +++ b/tools/lib/thermal/Makefile @@ -39,19 +39,6 @@ libdir = $(prefix)/$(libdir_relative) libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - # Set compile option CFLAGS ifdef EXTRA_CFLAGS CFLAGS := $(EXTRA_CFLAGS) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index f56e277275341..7a65948892e56 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -46,12 +46,6 @@ HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" AWK = awk MKDIR = mkdir -ifeq ($(V),1) - Q = -else - Q = @ -endif - BUILD_ORC := n ifeq ($(SRCARCH),x86) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index fe1362c345647..181aa604f8cdf 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -850,5 +850,6 @@ bool arch_is_rethunk(struct symbol *sym) bool arch_is_embedded_insn(struct symbol *sym) { return !strcmp(sym->name, "retbleed_return_thunk") || + !strcmp(sym->name, "srso_alias_safe_ret") || !strcmp(sym->name, "srso_safe_ret"); } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index be18a04893030..814d40724069d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1284,15 +1284,6 @@ static void annotate_call_site(struct objtool_file *file, if (!sym) sym = reloc->sym; - /* - * Alternative replacement code is just template code which is - * sometimes copied to the original instruction. For now, don't - * annotate it. (In the future we might consider annotating the - * original instruction if/when it ever makes sense to do so.) - */ - if (!strcmp(insn->sec->name, ".altinstr_replacement")) - return; - if (sym->static_call_tramp) { list_add_tail(&insn->call_node, &file->static_call_list); return; @@ -1350,7 +1341,8 @@ static void annotate_call_site(struct objtool_file *file, return; } - if (insn->type == INSN_CALL && !insn->sec->init) + if (insn->type == INSN_CALL && !insn->sec->init && + !insn->_call_dest->embedded_insn) list_add_tail(&insn->call_node, &file->call_list); if (!sibling && dead_end_function(file, sym)) @@ -2472,13 +2464,14 @@ static void mark_rodata(struct objtool_file *file) * * - .rodata: can contain GCC switch tables * - .rodata.: same, if -fdata-sections is being used - * - .rodata..c_jump_table: contains C annotated jump tables + * - .data.rel.ro.c_jump_table: contains C annotated jump tables * * .rodata.str1.* sections are ignored; they don't contain jump tables. */ for_each_sec(file, sec) { - if (!strncmp(sec->name, ".rodata", 7) && - !strstr(sec->name, ".str1.")) { + if ((!strncmp(sec->name, ".rodata", 7) && + !strstr(sec->name, ".str1.")) || + !strncmp(sec->name, ".data.rel.ro", 12)) { sec->rodata = true; found = true; } diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index e7ee7ffccefd4..e049679bb17b2 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -10,7 +10,7 @@ #include #include -#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" +#define C_JUMP_TABLE_SECTION ".data.rel.ro.c_jump_table" struct special_alt { struct list_head list; diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index b2174894f9f71..eacfe3b0a8d1d 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -16,10 +16,11 @@ NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) NORETURN(__x64_sys_exit) NORETURN(__x64_sys_exit_group) +NORETURN(acpi_processor_ffh_play_dead) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) -NORETURN(bch2_trans_unlocked_error) +NORETURN(bch2_trans_unlocked_or_in_restart_error) NORETURN(cpu_bringup_and_idle) NORETURN(cpu_startup_entry) NORETURN(do_exit) @@ -34,6 +35,7 @@ NORETURN(kunit_try_catch_throw) NORETURN(machine_real_restart) NORETURN(make_task_dead) NORETURN(mpt_halt_firmware) +NORETURN(mwait_play_dead) NORETURN(nmi_panic_self_stop) NORETURN(panic) NORETURN(panic_smp_self_stop) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 55d6ce9ea52fb..05c083bb11220 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -161,47 +161,6 @@ export VPATH SOURCE := $(shell ln -sf $(srctree)/tools/perf $(OUTPUT)/source) endif -# Beautify output -# --------------------------------------------------------------------------- -# -# Most of build commands in Kbuild start with "cmd_". You can optionally define -# "quiet_cmd_*". If defined, the short log is printed. Otherwise, no log from -# that command is printed by default. -# -# e.g.) -# quiet_cmd_depmod = DEPMOD $(MODLIB) -# cmd_depmod = $(srctree)/scripts/depmod.sh $(DEPMOD) $(KERNELRELEASE) -# -# A simple variant is to prefix commands with $(Q) - that's useful -# for commands that shall be hidden in non-verbose mode. -# -# $(Q)$(MAKE) $(build)=scripts/basic -# -# To put more focus on warnings, be less verbose as default -# Use 'make V=1' to see the full commands - -ifeq ($(V),1) - quiet = - Q = -else - quiet=quiet_ - Q=@ -endif - -# If the user is running make -s (silent mode), suppress echoing of commands -# make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS. -ifeq ($(filter 3.%,$(MAKE_VERSION)),) -short-opts := $(firstword -$(MAKEFLAGS)) -else -short-opts := $(filter-out --%,$(MAKEFLAGS)) -endif - -ifneq ($(findstring s,$(short-opts)),) - quiet=silent_ -endif - -export quiet Q - # Do not use make's built-in rules # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index d3c6e10dce73f..a4499e5a6f9cb 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -26,8 +26,6 @@ FILES=( "include/linux/hash.h" "include/linux/list-sort.h" "include/uapi/linux/hw_breakpoint.h" - "arch/x86/include/asm/disabled-features.h" - "arch/x86/include/asm/required-features.h" "arch/x86/include/asm/cpufeatures.h" "arch/x86/include/asm/inat_types.h" "arch/x86/include/asm/emulate_prefix.h" diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d5011a0bf60d..26057af6b5a1f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1056,7 +1056,7 @@ static const struct platform_data turbostat_pdata[] = { * Missing support for * INTEL_ICELAKE * INTEL_ATOM_SILVERMONT_MID - * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_SILVERMONT_MID2 * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 0aa4005017c72..45f4abef70640 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -136,6 +136,33 @@ else NO_SUBDIR = : endif +# Beautify output +# --------------------------------------------------------------------------- +# +# Most of build commands in Kbuild start with "cmd_". You can optionally define +# "quiet_cmd_*". If defined, the short log is printed. Otherwise, no log from +# that command is printed by default. +# +# e.g.) +# quiet_cmd_depmod = DEPMOD $(MODLIB) +# cmd_depmod = $(srctree)/scripts/depmod.sh $(DEPMOD) $(KERNELRELEASE) +# +# A simple variant is to prefix commands with $(Q) - that's useful +# for commands that shall be hidden in non-verbose mode. +# +# $(Q)$(MAKE) $(build)=scripts/basic +# +# To put more focus on warnings, be less verbose as default +# Use 'make V=1' to see the full commands + +ifeq ($(V),1) + quiet = + Q = +else + quiet = quiet_ + Q = @ +endif + # If the user is running make -s (silent mode), suppress echoing of commands # make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS. ifeq ($(filter 3.%,$(MAKE_VERSION)),) @@ -146,8 +173,11 @@ endif ifneq ($(findstring s,$(short-opts)),) silent=1 + quiet=silent_ endif +export quiet Q + # # Define a callable command for descending to a new directory # diff --git a/tools/sound/dapm-graph b/tools/sound/dapm-graph index f14bdfedee8f1..b6196ee5065a4 100755 --- a/tools/sound/dapm-graph +++ b/tools/sound/dapm-graph @@ -10,7 +10,7 @@ set -eu STYLE_COMPONENT_ON="color=dodgerblue;style=bold" STYLE_COMPONENT_OFF="color=gray40;style=filled;fillcolor=gray90" -STYLE_NODE_ON="shape=box,style=bold,color=green4" +STYLE_NODE_ON="shape=box,style=bold,color=green4,fillcolor=white" STYLE_NODE_OFF="shape=box,style=filled,color=gray30,fillcolor=gray95" # Print usage and exit diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index eb6a4fea8c794..f7f9e7088bb38 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -7,12 +7,6 @@ INSTALL ?= install RM ?= rm -f RMDIR ?= rmdir --ignore-fail-on-non-empty -ifeq ($(V),1) - Q = -else - Q = @ -endif - prefix ?= /usr/local mandir ?= $(prefix)/man man2dir = $(mandir)/man2 diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 1bd403a5ef7b3..0fd8c9b0d38f6 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -526,6 +526,12 @@ extern const struct bench bench_trig_uprobe_multi_push; extern const struct bench bench_trig_uretprobe_multi_push; extern const struct bench bench_trig_uprobe_multi_ret; extern const struct bench bench_trig_uretprobe_multi_ret; +#ifdef __x86_64__ +extern const struct bench bench_trig_uprobe_nop5; +extern const struct bench bench_trig_uretprobe_nop5; +extern const struct bench bench_trig_uprobe_multi_nop5; +extern const struct bench bench_trig_uretprobe_multi_nop5; +#endif extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; @@ -586,6 +592,12 @@ static const struct bench *benchs[] = { &bench_trig_uretprobe_multi_push, &bench_trig_uprobe_multi_ret, &bench_trig_uretprobe_multi_ret, +#ifdef __x86_64__ + &bench_trig_uprobe_nop5, + &bench_trig_uretprobe_nop5, + &bench_trig_uprobe_multi_nop5, + &bench_trig_uretprobe_multi_nop5, +#endif /* ringbuf/perfbuf benchmarks */ &bench_rb_libbpf, &bench_rb_custom, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 32e9f194d4497..82327657846e5 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -333,6 +333,20 @@ static void *uprobe_producer_ret(void *input) return NULL; } +#ifdef __x86_64__ +__nocf_check __weak void uprobe_target_nop5(void) +{ + asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00"); +} + +static void *uprobe_producer_nop5(void *input) +{ + while (true) + uprobe_target_nop5(); + return NULL; +} +#endif + static void usetup(bool use_retprobe, bool use_multi, void *target_addr) { size_t uprobe_offset; @@ -448,6 +462,28 @@ static void uretprobe_multi_ret_setup(void) usetup(true, true /* use_multi */, &uprobe_target_ret); } +#ifdef __x86_64__ +static void uprobe_nop5_setup(void) +{ + usetup(false, false /* !use_multi */, &uprobe_target_nop5); +} + +static void uretprobe_nop5_setup(void) +{ + usetup(true, false /* !use_multi */, &uprobe_target_nop5); +} + +static void uprobe_multi_nop5_setup(void) +{ + usetup(false, true /* use_multi */, &uprobe_target_nop5); +} + +static void uretprobe_multi_nop5_setup(void) +{ + usetup(true, true /* use_multi */, &uprobe_target_nop5); +} +#endif + const struct bench bench_trig_syscall_count = { .name = "trig-syscall-count", .validate = trigger_validate, @@ -506,3 +542,9 @@ BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret"); BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret"); +#ifdef __x86_64__ +BENCH_TRIG_USERMODE(uprobe_nop5, nop5, "uprobe-nop5"); +BENCH_TRIG_USERMODE(uretprobe_nop5, nop5, "uretprobe-nop5"); +BENCH_TRIG_USERMODE(uprobe_multi_nop5, nop5, "uprobe-multi-nop5"); +BENCH_TRIG_USERMODE(uretprobe_multi_nop5, nop5, "uretprobe-multi-nop5"); +#endif diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh index af169f831f2f2..03f55405484b2 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh @@ -2,7 +2,7 @@ set -eufo pipefail -for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} +for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret,nop5} do summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) printf "%-15s: %s\n" $i "$summary" diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index c397336fe1ed1..9c7e31bb20dcb 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -4,6 +4,8 @@ #ifdef __x86_64__ +#pragma GCC diagnostic ignored "-Wattributes" + #include #include #include @@ -14,16 +16,41 @@ #include #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "sdt.h" + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +/* + * Compiler (gcc) can ignore __naked attribute and place endbr64 at the + * begining of the function, so let's just scan few more bytes to make + * sure we have nop5 addres/offset. + */ +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +static unsigned long get_nop5_offset(void *fn, void **paddr) +{ + return get_uprobe_offset(fn); +} -__naked unsigned long uretprobe_regs_trigger(void) +__nocf_check __weak __naked long uprobe_regs_trigger(void) { asm volatile ( - "movq $0xdeadbeef, %rax\n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "movq $0xdeadbeef, %rax \n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__nocf_check __weak __naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -52,7 +79,7 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +119,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_nop5_offset(&uprobe_regs_trigger, NULL); + if (!ASSERT_GE(offset, 0, "get_nop5_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +158,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. */ switch (offset) { @@ -133,7 +172,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call */ switch (offset) { /* @@ -146,6 +185,10 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + case offsetof(struct pt_regs, rax): + /* uprobe does not see return value */ + if (!retprobe) + break; default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +218,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? (int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_uprobe_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +226,18 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_nop5_offset(&uprobe_regs_trigger, NULL); + if (!ASSERT_GE(offset, 0, "get_nop5_offset")) + return; err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -219,7 +267,7 @@ static void test_uretprobe_regs_change(void) #define __NR_uretprobe 335 #endif -__naked unsigned long uretprobe_syscall_call_1(void) +__nocf_check __weak __naked unsigned long uretprobe_syscall_call_1(void) { /* * Pretend we are uretprobe trampoline to trigger the return @@ -237,7 +285,7 @@ __naked unsigned long uretprobe_syscall_call_1(void) ); } -__naked unsigned long uretprobe_syscall_call(void) +__nocf_check __weak __naked unsigned long uretprobe_syscall_call(void) { asm volatile ( "call uretprobe_syscall_call_1\n" @@ -277,10 +325,10 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->links.test_uretprobe_multi = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(skel->links.test_uretprobe_multi, "bpf_program__attach_uprobe_multi")) goto cleanup; /* kick the child */ @@ -344,42 +392,350 @@ static void test_uretprobe_shadow_stack(void) } /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + test_uprobe_regs_equal(false); test_uretprobe_syscall_call(); + test_uprobe_regs_change(); ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +#define TRAMP "[uprobes-trampoline]" + +__nocf_check __weak __naked void uprobe_test(void) { - test__skip(); + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); } -static void test_uretprobe_regs_change(void) +noinline void usdt_test(void) { - test__skip(); + STAP_PROBE(optimized_uprobe, usdt); } -static void test_uretprobe_syscall_call(void) +static int find_uprobes_trampoline(void **start, void **end) { - test__skip(); + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. */ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", start, end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; } -static void test_uretprobe_shadow_stack(void) +static void check_attach(struct uprobe_syscall_executed *skel, void (*trigger)(void), void *addr) { - test__skip(); + void *tramp_start, *tramp_end; + struct __arch_relative_insn { + u8 op; + s32 raddr; + } __packed *call; + + s32 delta; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + if (!ASSERT_OK(find_uprobes_trampoline(&tramp_start, &tramp_end), "uprobes_trampoline")) + return; + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, 2, "executed"); + + /* .. and check the trampoline is as expected. */ + call = (struct __arch_relative_insn *) addr; + delta = (unsigned long) tramp_start - ((unsigned long) addr + 5); + + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_EQ(call->raddr, delta, "delta"); + ASSERT_EQ(tramp_end - tramp_start, 4096, "size"); } + +static void check_detach(struct uprobe_syscall_executed *skel, void (*trigger)(void), void *addr) +{ + void *tramp_start, *tramp_end; + + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(find_uprobes_trampoline(&tramp_start, &tramp_end), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + void (*trigger)(void), void *addr) +{ + check_attach(skel, trigger, addr); + bpf_link__destroy(link); + check_detach(skel, trigger, addr); +} + +static void test_uprobe_legacy(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + unsigned long offset; + void *addr; + + offset = get_nop5_offset(&uprobe_test, &addr); + if (!ASSERT_GE(offset, 0, "get_nop5_offset")) + return; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, addr); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, addr); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + unsigned long offset; + void *addr; + + /* uprobe */ + offset = get_nop5_offset(&uprobe_test, &addr); + if (!ASSERT_GE(offset, 0, "get_nop5_offset")) + return; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + /* uprobe.multi */ + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, addr); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, addr); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static volatile bool race_stop; + +static void *worker_trigger(void *arg) +{ + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; +} + +static void *worker_attach(void *arg) +{ + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + + offset = get_nop5_offset(&uprobe_test, NULL); + if (!ASSERT_GE(offset, 0, "get_nop5_offset")) + return NULL; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + return NULL; +} + +static void test_uprobe_race(void) +{ + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GE(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + + threads = malloc(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + sleep(4); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); +} + +#ifndef __NR_uprobe +#define __NR_uprobe 336 #endif -void test_uprobe_syscall(void) +static void test_uprobe_sigill(void) +{ + int status, err, pid; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork")) + return; + /* child */ + if (pid == 0) { + asm volatile ( + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "popq %r11\n" + "popq %rcx\n" + "retq\n" + ); + exit(0); + } + + err = waitpid(pid, &status, 0); + ASSERT_EQ(err, pid, "waitpid"); + + /* verify the child got killed with SIGILL */ + ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED"); + ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG"); +} + +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_sigill")) + test_uprobe_sigill(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("uprobe_regs_change")) + test_uprobe_regs_change(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 495d66414b57e..3a5b5230bfa07 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. */ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +149,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -158,6 +165,7 @@ static void subtest_basic_usdt(void) cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -419,7 +427,11 @@ static void subtest_urandom_usdt(bool auto_attach) void test_usdt(void) { if (test__start_subtest("basic")) - subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef590..e08c31669e5a7 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2e..802fd562ce4c0 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include +#include +#include #include struct pt_regs regs; @@ -9,9 +11,37 @@ char _license[] SEC("license") = "GPL"; int executed = 0; +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + executed++; + return 0; +} + SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) +{ + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) { - executed = 1; + executed++; return 0; } diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h index 1fcfa5160231d..1d62c06f5ddcb 100644 --- a/tools/testing/selftests/bpf/sdt.h +++ b/tools/testing/selftests/bpf/sdt.h @@ -236,6 +236,13 @@ __extension__ extern unsigned long long __sdt_unsp; #define _SDT_NOP nop #endif +/* Use 5 byte nop for x86_64 to allow optimizing uprobes. */ +#if defined(__x86_64__) +# define _SDT_DEF_NOP _SDT_ASM_5(990: .byte 0x0f, 0x1f, 0x44, 0x00, 0x00) +#else +# define _SDT_DEF_NOP _SDT_ASM_1(990: _SDT_NOP) +#endif + #define _SDT_NOTE_NAME "stapsdt" #define _SDT_NOTE_TYPE 3 @@ -288,7 +295,7 @@ __extension__ extern unsigned long long __sdt_unsp; #define _SDT_ASM_BODY(provider, name, pack_args, args, ...) \ _SDT_DEF_MACROS \ - _SDT_ASM_1(990: _SDT_NOP) \ + _SDT_DEF_NOP \ _SDT_ASM_3( .pushsection .note.stapsdt,_SDT_ASM_AUTOGROUP,"note") \ _SDT_ASM_1( .balign 4) \ _SDT_ASM_3( .4byte 992f-991f, 994f-993f, _SDT_NOTE_TYPE) \ diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 3220f1d28697e..08494fcf6a58c 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -496,15 +496,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { */ #ifdef __x86_64__ +static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -516,6 +522,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py index 2e8a74aff5431..58f3291fed12a 100755 --- a/tools/testing/selftests/damon/damon_nr_regions.py +++ b/tools/testing/selftests/damon/damon_nr_regions.py @@ -65,6 +65,7 @@ def test_nr_regions(real_nr_regions, min_nr_regions, max_nr_regions): test_name = 'nr_regions test with %d/%d/%d real/min/max nr_regions' % ( real_nr_regions, min_nr_regions, max_nr_regions) + collected_nr_regions.sort() if (collected_nr_regions[0] < min_nr_regions or collected_nr_regions[-1] > max_nr_regions): print('fail %s' % test_name) @@ -109,6 +110,7 @@ def main(): attrs = kdamonds.kdamonds[0].contexts[0].monitoring_attrs attrs.min_nr_regions = 3 attrs.max_nr_regions = 7 + attrs.update_us = 100000 err = kdamonds.kdamonds[0].commit() if err is not None: proc.terminate() diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py index 7d4c6bb2e3cd2..57c4937aaed28 100755 --- a/tools/testing/selftests/damon/damos_quota.py +++ b/tools/testing/selftests/damon/damos_quota.py @@ -51,16 +51,19 @@ def main(): nr_quota_exceeds = scheme.stats.qt_exceeds wss_collected.sort() + nr_expected_quota_exceeds = 0 for wss in wss_collected: if wss > sz_quota: print('quota is not kept: %s > %s' % (wss, sz_quota)) print('collected samples are as below') print('\n'.join(['%d' % wss for wss in wss_collected])) exit(1) + if wss == sz_quota: + nr_expected_quota_exceeds += 1 - if nr_quota_exceeds < len(wss_collected): - print('quota is not always exceeded: %d > %d' % - (len(wss_collected), nr_quota_exceeds)) + if nr_quota_exceeds < nr_expected_quota_exceeds: + print('quota is exceeded less than expected: %d < %d' % + (nr_quota_exceeds, nr_expected_quota_exceeds)) exit(1) if __name__ == '__main__': diff --git a/tools/testing/selftests/damon/damos_quota_goal.py b/tools/testing/selftests/damon/damos_quota_goal.py index 18246f3b62f7e..f76e0412b564c 100755 --- a/tools/testing/selftests/damon/damos_quota_goal.py +++ b/tools/testing/selftests/damon/damos_quota_goal.py @@ -63,6 +63,9 @@ def main(): if last_effective_bytes != 0 else -1.0)) if last_effective_bytes == goal.effective_bytes: + # effective quota was already minimum that cannot be more reduced + if expect_increase is False and last_effective_bytes == 1: + continue print('efective bytes not changed: %d' % goal.effective_bytes) exit(1) diff --git a/tools/testing/selftests/drivers/net/hds.py b/tools/testing/selftests/drivers/net/hds.py index 394971b25c0b1..873f5219e41d7 100755 --- a/tools/testing/selftests/drivers/net/hds.py +++ b/tools/testing/selftests/drivers/net/hds.py @@ -2,17 +2,54 @@ # SPDX-License-Identifier: GPL-2.0 import errno +import os from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_raises, KsftSkipEx -from lib.py import EthtoolFamily, NlError +from lib.py import CmdExitFailure, EthtoolFamily, NlError from lib.py import NetDrvEnv +from lib.py import defer, ethtool, ip -def get_hds(cfg, netnl) -> None: + +def _get_hds_mode(cfg, netnl) -> str: try: rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) except NlError as e: raise KsftSkipEx('ring-get not supported by device') if 'tcp-data-split' not in rings: raise KsftSkipEx('tcp-data-split not supported by device') + return rings['tcp-data-split'] + + +def _xdp_onoff(cfg): + test_dir = os.path.dirname(os.path.realpath(__file__)) + prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" + ip("link set dev %s xdp obj %s sec xdp" % + (cfg.ifname, prog)) + ip("link set dev %s xdp off" % cfg.ifname) + + +def _ioctl_ringparam_modify(cfg, netnl) -> None: + """ + Helper for performing a hopefully unimportant IOCTL SET. + IOCTL does not support HDS, so it should not affect the HDS config. + """ + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + + if 'tx' not in rings: + raise KsftSkipEx('setting Tx ring size not supported') + + try: + ethtool(f"--disable-netlink -G {cfg.ifname} tx {rings['tx'] // 2}") + except CmdExitFailure as e: + ethtool(f"--disable-netlink -G {cfg.ifname} tx {rings['tx'] * 2}") + defer(ethtool, f"-G {cfg.ifname} tx {rings['tx']}") + + +def get_hds(cfg, netnl) -> None: + _get_hds_mode(cfg, netnl) + def get_hds_thresh(cfg, netnl) -> None: try: @@ -104,6 +141,103 @@ def set_hds_thresh_gt(cfg, netnl) -> None: netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'hds-thresh': hds_gt}) ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + +def set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "auto" / UNKNOWN mode, XDP installation should work. + """ + mode = _get_hds_mode(cfg, netnl) + if mode == 'enabled': + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + _xdp_onoff(cfg) + + +def enabled_set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "enabled" mode, XDP installation should not work. + """ + _get_hds_mode(cfg, netnl) + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled'}) + + defer(netnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + with ksft_raises(CmdExitFailure) as e: + _xdp_onoff(cfg) + + +def set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "auto" / UNKNOWN mode, XDP installation should work. + """ + mode = _get_hds_mode(cfg, netnl) + if mode == 'enabled': + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + _xdp_onoff(cfg) + + +def enabled_set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "enabled" mode, XDP installation should not work. + """ + _get_hds_mode(cfg, netnl) # Trigger skip if not supported + + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled'}) + defer(netnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + with ksft_raises(CmdExitFailure) as e: + _xdp_onoff(cfg) + + +def ioctl(cfg, netnl) -> None: + mode1 = _get_hds_mode(cfg, netnl) + _ioctl_ringparam_modify(cfg, netnl) + mode2 = _get_hds_mode(cfg, netnl) + + ksft_eq(mode1, mode2) + + +def ioctl_set_xdp(cfg, netnl) -> None: + """ + Like set_xdp(), but we perturb the settings via the legacy ioctl. + """ + mode = _get_hds_mode(cfg, netnl) + if mode == 'enabled': + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + _ioctl_ringparam_modify(cfg, netnl) + + _xdp_onoff(cfg) + + +def ioctl_enabled_set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "enabled" mode, XDP installation should not work. + """ + _get_hds_mode(cfg, netnl) # Trigger skip if not supported + + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled'}) + defer(netnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + with ksft_raises(CmdExitFailure) as e: + _xdp_onoff(cfg) + + def main() -> None: with NetDrvEnv(__file__, queue_count=3) as cfg: ksft_run([get_hds, @@ -112,7 +246,12 @@ def main() -> None: set_hds_enable, set_hds_thresh_zero, set_hds_thresh_max, - set_hds_thresh_gt], + set_hds_thresh_gt, + set_xdp, + enabled_set_xdp, + ioctl, + ioctl_set_xdp, + ioctl_enabled_set_xdp], args=(cfg, EthtoolFamily())) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 38303da957ee5..8a518905a9f9c 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -45,10 +45,9 @@ def addremove_queues(cfg, nl) -> None: netnl = EthtoolFamily() channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}}) - if channels['combined-count'] == 0: - rx_type = 'rx' - else: - rx_type = 'combined' + rx_type = 'rx' + if channels.get('combined-count', 0) > 0: + rx_type = 'combined' expected = curr_queues - 1 cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc index dc25bcf4f9e2c..73f6c6fcecabe 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc @@ -7,12 +7,42 @@ echo 0 > events/enable echo > dynamic_events PLACE=$FUNCTION_FORK +PLACE2="kmem_cache_free" +PLACE3="schedule_timeout" + +# Some functions may have BPF programs attached, therefore +# count already enabled_functions before tests start +ocnt=`cat enabled_functions | wc -l` echo "f:myevent1 $PLACE" >> dynamic_events + +# Make sure the event is attached and is the only one +grep -q $PLACE enabled_functions +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $((ocnt + 1)) ]; then + exit_fail +fi + echo "f:myevent2 $PLACE%return" >> dynamic_events +# It should till be the only attached function +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $((ocnt + 1)) ]; then + exit_fail +fi + +# add another event +echo "f:myevent3 $PLACE2" >> dynamic_events + +grep -q $PLACE2 enabled_functions +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $((ocnt + 2)) ]; then + exit_fail +fi + grep -q myevent1 dynamic_events grep -q myevent2 dynamic_events +grep -q myevent3 dynamic_events test -d events/fprobes/myevent1 test -d events/fprobes/myevent2 @@ -21,6 +51,34 @@ echo "-:myevent2" >> dynamic_events grep -q myevent1 dynamic_events ! grep -q myevent2 dynamic_events +# should still have 2 left +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $((ocnt + 2)) ]; then + exit_fail +fi + echo > dynamic_events +# Should have none left +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $ocnt ]; then + exit_fail +fi + +echo "f:myevent4 $PLACE" >> dynamic_events + +# Should only have one enabled +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $((ocnt + 1)) ]; then + exit_fail +fi + +echo > dynamic_events + +# Should have none left +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne $ocnt ]; then + exit_fail +fi + clear_trace diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile index 0336353bd15f0..2839d2612ce3a 100644 --- a/tools/testing/selftests/hid/Makefile +++ b/tools/testing/selftests/hid/Makefile @@ -43,10 +43,8 @@ TEST_GEN_PROGS = hid_bpf hidraw # $3 - target (assumed to be file); only file name will be emitted; # $4 - optional extra arg, emitted as-is, if provided. ifeq ($(V),1) -Q = msg = else -Q = @ msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; MAKEFLAGS += --no-print-directory submake_extras := feature_display=0 diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index d9c76b4c0d88a..6a437d2be9fa4 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -18,6 +18,7 @@ #include "ucall_common.h" static bool mprotect_ro_done; +static bool all_vcpus_hit_ro_fault; static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) { @@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) /* * Write to the region while mprotect(PROT_READ) is underway. Keep - * looping until the memory is guaranteed to be read-only, otherwise - * vCPUs may complete their writes and advance to the next stage - * prematurely. + * looping until the memory is guaranteed to be read-only and a fault + * has occurred, otherwise vCPUs may complete their writes and advance + * to the next stage prematurely. * * For architectures that support skipping the faulting instruction, * generate the store via inline assembly to ensure the exact length @@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) #else vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); #endif - } while (!READ_ONCE(mprotect_ro_done)); + } while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault)); /* * Only architectures that write the entire range can explicitly sync, @@ -81,6 +82,7 @@ struct vcpu_info { static int nr_vcpus; static atomic_t rendezvous; +static atomic_t nr_ro_faults; static void rendezvous_with_boss(void) { @@ -148,12 +150,16 @@ static void *vcpu_worker(void *data) * be stuck on the faulting instruction for other architectures. Go to * stage 3 without a rendezvous */ - do { - r = _vcpu_run(vcpu); - } while (!r); + r = _vcpu_run(vcpu); TEST_ASSERT(r == -1 && errno == EFAULT, "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno); + atomic_inc(&nr_ro_faults); + if (atomic_read(&nr_ro_faults) == nr_vcpus) { + WRITE_ONCE(all_vcpus_hit_ro_fault, true); + sync_global_to_guest(vm, all_vcpus_hit_ro_fault); + } + #if defined(__x86_64__) || defined(__aarch64__) /* * Verify *all* writes from the guest hit EFAULT due to the VMA now @@ -378,7 +384,6 @@ int main(int argc, char *argv[]) rendezvous_with_vcpus(&time_run2, "run 2"); mprotect(mem, slot_size, PROT_READ); - usleep(10); mprotect_ro_done = true; sync_global_to_guest(vm, mprotect_ro_done); diff --git a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c index 3eb0313ffa397..3641a42934acb 100644 --- a/tools/testing/selftests/kvm/x86/nested_exceptions_test.c +++ b/tools/testing/selftests/kvm/x86/nested_exceptions_test.c @@ -85,6 +85,7 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); + GUEST_ASSERT(!ctrl->int_state); } static void l1_svm_code(struct svm_test_data *svm) @@ -122,6 +123,7 @@ static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); + GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO)); } static void l1_vmx_code(struct vmx_pages *vmx) diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index a1a688e752666..d97816dc476a2 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -52,7 +52,8 @@ static void compare_xsave(u8 *from_host, u8 *from_guest) bool bad = false; for (i = 0; i < 4095; i++) { if (from_host[i] != from_guest[i]) { - printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + printf("mismatch at %u | %02hhx %02hhx\n", + i, from_host[i], from_guest[i]); bad = true; } } diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore index 470203a7cd737..335b2b1a3463a 100644 --- a/tools/testing/selftests/landlock/.gitignore +++ b/tools/testing/selftests/landlock/.gitignore @@ -1,2 +1,4 @@ /*_test +/sandbox-and-launch /true +/wait-pipe diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index a604ea5d8297c..6064c9ac05329 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -207,6 +207,7 @@ enforce_ruleset(struct __test_metadata *const _metadata, const int ruleset_fd) struct protocol_variant { int domain; int type; + int protocol; }; struct service_fixture { diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config index 29af19c4e9f98..425de4c20271c 100644 --- a/tools/testing/selftests/landlock/config +++ b/tools/testing/selftests/landlock/config @@ -1,8 +1,11 @@ +CONFIG_AF_UNIX_OOB=y CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y CONFIG_INET=y CONFIG_IPV6=y CONFIG_KEYS=y +CONFIG_MPTCP=y +CONFIG_MPTCP_IPV6=y CONFIG_NET=y CONFIG_NET_NS=y CONFIG_OVERLAY_FS=y diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 4e0aeb53b225a..d9de0ee49ebc2 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -85,18 +85,18 @@ static void setup_loopback(struct __test_metadata *const _metadata) clear_ambient_cap(_metadata, CAP_NET_ADMIN); } +static bool prot_is_tcp(const struct protocol_variant *const prot) +{ + return (prot->domain == AF_INET || prot->domain == AF_INET6) && + prot->type == SOCK_STREAM && + (prot->protocol == IPPROTO_TCP || prot->protocol == IPPROTO_IP); +} + static bool is_restricted(const struct protocol_variant *const prot, const enum sandbox_type sandbox) { - switch (prot->domain) { - case AF_INET: - case AF_INET6: - switch (prot->type) { - case SOCK_STREAM: - return sandbox == TCP_SANDBOX; - } - break; - } + if (sandbox == TCP_SANDBOX) + return prot_is_tcp(prot); return false; } @@ -105,7 +105,7 @@ static int socket_variant(const struct service_fixture *const srv) int ret; ret = socket(srv->protocol.domain, srv->protocol.type | SOCK_CLOEXEC, - 0); + srv->protocol.protocol); if (ret < 0) return -errno; return ret; @@ -290,22 +290,70 @@ FIXTURE_TEARDOWN(protocol) } /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp) { +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp1) { /* clang-format on */ .sandbox = NO_SANDBOX, .prot = { .domain = AF_INET, .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, }, }; /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp) { +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp2) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_mptcp) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp1) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp2) { /* clang-format on */ .sandbox = NO_SANDBOX, .prot = { .domain = AF_INET6, .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_mptcp) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, }, }; @@ -350,22 +398,70 @@ FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_unix_datagram) { }; /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp) { +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp1) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp2) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_mptcp) { /* clang-format on */ .sandbox = TCP_SANDBOX, .prot = { .domain = AF_INET, .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp1) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp2) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, }, }; /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp) { +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_mptcp) { /* clang-format on */ .sandbox = TCP_SANDBOX, .prot = { .domain = AF_INET6, .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, }, }; diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index d6edcfcb5be83..5303900339292 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -228,4 +228,7 @@ $(OUTPUT)/%:%.S $(LINK.S) $^ $(LDLIBS) -o $@ endif -.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir +headers: + $(Q)$(MAKE) -C $(top_srcdir) headers + +.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir headers diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c index ada9156cc497b..c463d1c09c9b4 100644 --- a/tools/testing/selftests/mm/hugepage-mremap.c +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -15,7 +15,7 @@ #define _GNU_SOURCE #include #include -#include +#include #include #include #include /* Definition of O_* constants */ diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 66b4e111b5a27..b61803e36d1cf 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -369,6 +369,7 @@ static void test_unmerge_discarded(void) munmap(map, size); } +#ifdef __NR_userfaultfd static void test_unmerge_uffd_wp(void) { struct uffdio_writeprotect uffd_writeprotect; @@ -429,6 +430,7 @@ static void test_unmerge_uffd_wp(void) unmap: munmap(map, size); } +#endif /* Verify that KSM can be enabled / queried with prctl. */ static void test_prctl(void) @@ -684,7 +686,9 @@ int main(int argc, char **argv) exit(test_child_ksm()); } +#ifdef __NR_userfaultfd tests++; +#endif ksft_print_header(); ksft_set_plan(tests); @@ -696,7 +700,9 @@ int main(int argc, char **argv) test_unmerge(); test_unmerge_zero_pages(); test_unmerge_discarded(); +#ifdef __NR_userfaultfd test_unmerge_uffd_wp(); +#endif test_prot_none(); diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 74c911aa3aea9..9a0597310a765 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -28,6 +28,8 @@ #define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) #define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) +#ifdef __NR_memfd_secret + #define PATTERN 0x55 static const int prot = PROT_READ | PROT_WRITE; @@ -332,3 +334,13 @@ int main(int argc, char *argv[]) ksft_finished(); } + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c index af2fce496912b..09feeb4536460 100644 --- a/tools/testing/selftests/mm/mkdirty.c +++ b/tools/testing/selftests/mm/mkdirty.c @@ -9,7 +9,7 @@ */ #include #include -#include +#include #include #include #include @@ -265,6 +265,7 @@ static void test_pte_mapped_thp(void) munmap(mmap_mem, mmap_size); } +#ifdef __NR_userfaultfd static void test_uffdio_copy(void) { struct uffdio_register uffdio_register; @@ -322,6 +323,7 @@ static void test_uffdio_copy(void) munmap(dst, pagesize); free(src); } +#endif /* __NR_userfaultfd */ int main(void) { @@ -334,7 +336,9 @@ int main(void) thpsize / 1024); tests += 3; } +#ifdef __NR_userfaultfd tests += 1; +#endif /* __NR_userfaultfd */ ksft_print_header(); ksft_set_plan(tests); @@ -364,7 +368,9 @@ int main(void) if (thpsize) test_pte_mapped_thp(); /* Placing a fresh page via userfaultfd may set the PTE dirty. */ +#ifdef __NR_userfaultfd test_uffdio_copy(); +#endif /* __NR_userfaultfd */ err = ksft_get_fail_cnt(); if (err) diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h index 1e5731bab499a..4417eaa5cfb78 100644 --- a/tools/testing/selftests/mm/mlock2.h +++ b/tools/testing/selftests/mm/mlock2.h @@ -3,7 +3,6 @@ #include #include #include -#include static int mlock2_(void *start, size_t len, int flags) { diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index a4683f2476f27..35565af308af6 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 717539eddf987..7ad6ba660c7d6 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -673,7 +673,11 @@ int uffd_open_dev(unsigned int flags) int uffd_open_sys(unsigned int flags) { +#ifdef __NR_userfaultfd return syscall(__NR_userfaultfd, flags); +#else + return -1; +#endif } int uffd_open(unsigned int flags) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index a4b83280998ab..944d559ade21f 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -33,10 +33,11 @@ * pthread_mutex_lock will also verify the atomicity of the memory * transfer (UFFDIO_COPY). */ -#include + #include "uffd-common.h" uint64_t features; +#ifdef __NR_userfaultfd #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) @@ -471,3 +472,15 @@ int main(int argc, char **argv) nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); } + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 9ff71fa1f9bf0..74c8bc02b5063 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -5,11 +5,12 @@ * Copyright (C) 2015-2023 Red Hat, Inc. */ -#include #include "uffd-common.h" #include "../../../../mm/gup_test.h" +#ifdef __NR_userfaultfd + /* The unit test doesn't need a large or random size, make it 32MB for now */ #define UFFD_TEST_MEM_SIZE (32UL << 20) @@ -1558,3 +1559,14 @@ int main(int argc, char *argv[]) return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; } +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("Skipping %s (missing __NR_userfaultfd)\n", __file__); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index bc6b6762baf3e..c22623b9a2a5f 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -9,7 +9,10 @@ TEST_FILES := ../../../../../Documentation/netlink/specs TEST_FILES += ../../../../net/ynl TEST_GEN_FILES += csum +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) TEST_INCLUDES := $(wildcard py/*.py sh/*.sh) include ../../lib.mk + +include ../bpf.mk diff --git a/tools/testing/selftests/net/lib/xdp_dummy.bpf.c b/tools/testing/selftests/net/lib/xdp_dummy.bpf.c new file mode 100644 index 0000000000000..d988b2e0cee84 --- /dev/null +++ b/tools/testing/selftests/net/lib/xdp_dummy.bpf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define KBUILD_MODNAME "xdp_dummy" +#include +#include + +SEC("xdp") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 16496de5f6ce4..0fda241fa62b0 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -9,3 +9,4 @@ param_test_compare_twice param_test_mm_cid param_test_mm_cid_benchmark param_test_mm_cid_compare_twice +syscall_errors_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 5a3432fceb586..0d0a5fae59547 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -16,11 +16,12 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ - param_test_mm_cid_benchmark param_test_mm_cid_compare_twice + param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ + syscall_errors_test TEST_GEN_PROGS_EXTENDED = librseq.so -TEST_PROGS = run_param_test.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh TEST_FILES := settings @@ -54,3 +55,7 @@ $(OUTPUT)/param_test_mm_cid_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \ rseq.h rseq-*.h $(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \ + rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/rseq-riscv-bits.h b/tools/testing/selftests/rseq/rseq-riscv-bits.h index de31a0143139b..f02f411d550d1 100644 --- a/tools/testing/selftests/rseq/rseq-riscv-bits.h +++ b/tools/testing/selftests/rseq/rseq-riscv-bits.h @@ -243,7 +243,7 @@ int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, i #ifdef RSEQ_COMPARE_TWICE RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]") #endif - RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, 3) + RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, inc, 3) RSEQ_INJECT_ASM(4) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ @@ -251,8 +251,8 @@ int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, i [current_cpu_id] "m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD), [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [ptr] "r" (ptr), - [off] "er" (off), - [inc] "er" (inc) + [off] "r" (off), + [inc] "r" (inc) RSEQ_INJECT_INPUT : "memory", RSEQ_ASM_TMP_REG_1 RSEQ_INJECT_CLOBBER diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h index 37e598d0a365e..67d544aaa9a3b 100644 --- a/tools/testing/selftests/rseq/rseq-riscv.h +++ b/tools/testing/selftests/rseq/rseq-riscv.h @@ -158,7 +158,7 @@ do { \ "bnez " RSEQ_ASM_TMP_REG_1 ", 222b\n" \ "333:\n" -#define RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, post_commit_label) \ +#define RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, inc, post_commit_label) \ "mv " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(ptr) "]\n" \ RSEQ_ASM_OP_R_ADD(off) \ REG_L RSEQ_ASM_TMP_REG_1 ", 0(" RSEQ_ASM_TMP_REG_1 ")\n" \ diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index f6156790c3b4d..1e29db92094d8 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -87,7 +87,7 @@ static int sys_getcpu(unsigned *cpu, unsigned *node) return syscall(__NR_getcpu, cpu, node, NULL); } -int rseq_available(void) +bool rseq_available(void) { int rc; @@ -96,9 +96,9 @@ int rseq_available(void) abort(); switch (errno) { case ENOSYS: - return 0; + return false; case EINVAL: - return 1; + return true; default: abort(); } diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index ba424ce80a719..f51a5fdb04443 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -159,6 +159,11 @@ int32_t rseq_fallback_current_cpu(void); */ int32_t rseq_fallback_current_node(void); +/* + * Returns true if rseq is supported. + */ +bool rseq_available(void); + /* * Values returned can be either the current CPU number, -1 (rseq is * uninitialized), or -2 (rseq initialization has failed). diff --git a/tools/testing/selftests/rseq/run_syscall_errors_test.sh b/tools/testing/selftests/rseq/run_syscall_errors_test.sh new file mode 100755 index 0000000000000..9272246b39f24 --- /dev/null +++ b/tools/testing/selftests/rseq/run_syscall_errors_test.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Michael Jeanson + +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./syscall_errors_test diff --git a/tools/testing/selftests/rseq/syscall_errors_test.c b/tools/testing/selftests/rseq/syscall_errors_test.c new file mode 100644 index 0000000000000..a5d9e1f8a2dc2 --- /dev/null +++ b/tools/testing/selftests/rseq/syscall_errors_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2024 Michael Jeanson + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include + +#include "rseq.h" + +static int sys_rseq(void *rseq_abi, uint32_t rseq_len, + int flags, uint32_t sig) +{ + return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); +} + +/* + * Check the value of errno on some expected failures of the rseq syscall. + */ + +int main(void) +{ + struct rseq_abi *global_rseq = rseq_get_abi(); + int ret; + int errno_copy; + + if (!rseq_available()) { + fprintf(stderr, "rseq syscall unavailable"); + goto error; + } + + /* The current thread is NOT registered. */ + + /* EINVAL */ + errno = 0; + ret = sys_rseq(global_rseq, 32, -1, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid flag fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + errno = 0; + ret = sys_rseq((char *) global_rseq + 1, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with unaligned rseq_abi fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 31, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid size fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + +#if defined(__LP64__) && (!defined(__s390__) && !defined(__s390x__)) + /* + * We haven't found a reliable way to find an invalid address when + * running a 32bit userspace on a 64bit kernel, so only run this test + * on 64bit builds for the moment. + * + * Also exclude architectures that select + * CONFIG_ALTERNATE_USER_ADDRESS_SPACE where the kernel and userspace + * have their own address space and this failure can't happen. + */ + + /* EFAULT */ + errno = 0; + ret = sys_rseq((void *) -4096UL, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid address fails with errno set to EFAULT (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EFAULT) + goto error; +#endif + + errno = 0; + ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret != 0 && errno != 0) + goto error; + + /* The current thread is registered. */ + + /* EBUSY */ + errno = 0; + ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Double registration fails with errno set to EBUSY (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EBUSY) + goto error; + + /* EPERM */ + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG + 1); + errno_copy = errno; + fprintf(stderr, "Unregistration with wrong RSEQ_SIG fails with errno to EPERM (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EPERM) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Unregistration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret != 0) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Double unregistration fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + return 0; +error: + return -1; +} diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 14ba51b52095a..ad4afac59cc54 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -4893,7 +4893,25 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ +__attribute__((naked)) noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of nop5 instruction. + */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4946,35 +4964,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. + */ + bool uretprobe; }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, +}; + +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe ot __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4984,12 +5013,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5000,7 +5034,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5014,11 +5048,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * Uprobe is optimized after first hit, so let's hit twice. + */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5031,7 +5071,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5048,11 +5088,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5067,11 +5110,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 1cf14a8da4380..12a0614b9fd49 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -19,13 +19,20 @@ LDLIBS += -lgcc_s endif include ../lib.mk + +CFLAGS += $(TOOLS_INCLUDES) + +CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tables \ + -fno-stack-protector -include $(top_srcdir)/tools/include/nolibc/nolibc.h \ + -I$(top_srcdir)/tools/include/nolibc/ $(KHDR_INCLUDES) + $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c $(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c -$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c -$(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector +$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers +$(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 2fe5e983cb22f..3ff00fb624a44 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -19,13 +19,14 @@ #include #include #include -#include +#include +#include #include "parse_vdso.h" /* And here's the code. */ #ifndef ELF_BITS -# if ULONG_MAX > 0xffffffffUL +# if __SIZEOF_LONG__ >= 8 # define ELF_BITS 64 # else # define ELF_BITS 32 @@ -53,7 +54,7 @@ static struct vdso_info /* Symbol table */ ELF(Sym) *symtab; const char *symstrings; - ELF(Word) *gnu_hash; + ELF(Word) *gnu_hash, *gnu_bucket; ELF_HASH_ENTRY *bucket, *chain; ELF_HASH_ENTRY nbucket, nchain; @@ -185,8 +186,8 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) /* The bucket array is located after the header (4 uint32) and the bloom * filter (size_t array of gnu_hash[2] elements). */ - vdso_info.bucket = vdso_info.gnu_hash + 4 + - sizeof(size_t) / 4 * vdso_info.gnu_hash[2]; + vdso_info.gnu_bucket = vdso_info.gnu_hash + 4 + + sizeof(size_t) / 4 * vdso_info.gnu_hash[2]; } else { vdso_info.nbucket = hash[0]; vdso_info.nchain = hash[1]; @@ -268,11 +269,11 @@ void *vdso_sym(const char *version, const char *name) if (vdso_info.gnu_hash) { uint32_t h1 = gnu_hash(name), h2, *hashval; - i = vdso_info.bucket[h1 % vdso_info.nbucket]; + i = vdso_info.gnu_bucket[h1 % vdso_info.nbucket]; if (i == 0) return 0; h1 |= 1; - hashval = vdso_info.bucket + vdso_info.nbucket + + hashval = vdso_info.gnu_bucket + vdso_info.nbucket + (i - vdso_info.gnu_hash[1]); for (;; i++) { ELF(Sym) *sym = &vdso_info.symtab[i]; @@ -297,17 +298,3 @@ void *vdso_sym(const char *version, const char *name) return 0; } - -void vdso_init_from_auxv(void *auxv) -{ - ELF(auxv_t) *elf_auxv = auxv; - for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++) - { - if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) { - vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val); - return; - } - } - - vdso_info.valid = false; -} diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h index de0453067d7cd..09d068ed11f97 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.h +++ b/tools/testing/selftests/vDSO/parse_vdso.h @@ -26,6 +26,5 @@ */ void *vdso_sym(const char *version, const char *name); void vdso_init_from_sysinfo_ehdr(uintptr_t base); -void vdso_init_from_auxv(void *auxv); #endif diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 644915862af88..9ce795b806f09 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -1,142 +1,58 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vdso_test.c: Sample code to test parse_vdso.c on x86 - * Copyright (c) 2011-2014 Andy Lutomirski + * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and + * vDSO gettimeofday() + * Copyright (c) 2014 Andy Lutomirski * - * You can amuse yourself by compiling with: - * gcc -std=gnu99 -nostdlib - * -Os -fno-asynchronous-unwind-tables -flto -lgcc_s - * vdso_standalone_test_x86.c parse_vdso.c - * to generate a small binary. On x86_64, you can omit -lgcc_s - * if you want the binary to be completely standalone. + * Compile with: + * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso_gettimeofday.c + * + * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ -#include +#include +#ifndef NOLIBC +#include #include -#include -#include - -#include "parse_vdso.h" - -/* We need some libc functions... */ -int strcmp(const char *a, const char *b) -{ - /* This implementation is buggy: it never returns -1. */ - while (*a || *b) { - if (*a != *b) - return 1; - if (*a == 0 || *b == 0) - return 1; - a++; - b++; - } - - return 0; -} - -/* - * The clang build needs this, although gcc does not. - * Stolen from lib/string.c. - */ -void *memcpy(void *dest, const void *src, size_t count) -{ - char *tmp = dest; - const char *s = src; - - while (count--) - *tmp++ = *s++; - return dest; -} - -/* ...and two syscalls. This is x86-specific. */ -static inline long x86_syscall3(long nr, long a0, long a1, long a2) -{ - long ret; -#ifdef __x86_64__ - asm volatile ("syscall" : "=a" (ret) : "a" (nr), - "D" (a0), "S" (a1), "d" (a2) : - "cc", "memory", "rcx", - "r8", "r9", "r10", "r11" ); -#else - asm volatile ("int $0x80" : "=a" (ret) : "a" (nr), - "b" (a0), "c" (a1), "d" (a2) : - "cc", "memory" ); #endif - return ret; -} -static inline long linux_write(int fd, const void *data, size_t len) -{ - return x86_syscall3(__NR_write, fd, (long)data, (long)len); -} +#include "../kselftest.h" +#include "parse_vdso.h" +#include "vdso_config.h" +#include "vdso_call.h" -static inline void linux_exit(int code) +int main(int argc, char **argv) { - x86_syscall3(__NR_exit, code, 0, 0); -} + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; -void to_base10(char *lastdig, time_t n) -{ - while (n) { - *lastdig = (n % 10) + '0'; - n /= 10; - lastdig--; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + return KSFT_SKIP; } -} - -void c_main(void **stack) -{ - /* Parse the stack */ - long argc = (long)*stack; - stack += argc + 2; - - /* Now we're pointing at the environment. Skip it. */ - while(*stack) - stack++; - stack++; - /* Now we're pointing at auxv. Initialize the vDSO parser. */ - vdso_init_from_auxv((void *)stack); + vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); /* Find gettimeofday. */ typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); - gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); + gtod_t gtod = (gtod_t)vdso_sym(version, name[0]); - if (!gtod) - linux_exit(1); + if (!gtod) { + printf("Could not find %s\n", name[0]); + return KSFT_SKIP; + } struct timeval tv; - long ret = gtod(&tv, 0); + long ret = VDSO_CALL(gtod, 2, &tv, 0); if (ret == 0) { - char buf[] = "The time is .000000\n"; - to_base10(buf + 31, tv.tv_sec); - to_base10(buf + 38, tv.tv_usec); - linux_write(1, buf, sizeof(buf) - 1); + printf("The time is %lld.%06lld\n", + (long long)tv.tv_sec, (long long)tv.tv_usec); } else { - linux_exit(ret); + printf("%s failed\n", name[0]); + return KSFT_FAIL; } - linux_exit(0); + return 0; } - -/* - * This is the real entry point. It passes the initial stack into - * the C entry point. - */ -asm ( - ".text\n" - ".global _start\n" - ".type _start,@function\n" - "_start:\n\t" -#ifdef __x86_64__ - "mov %rsp,%rdi\n\t" - "and $-16,%rsp\n\t" - "sub $8,%rsp\n\t" - "jmp c_main" -#else - "push %esp\n\t" - "call c_main\n\t" - "int $3" -#endif - ); diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index e31b18ffae338..9ce795b806f09 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -10,11 +10,11 @@ * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ -#include -#include #include +#ifndef NOLIBC #include #include +#endif #include "../kselftest.h" #include "parse_vdso.h" diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d51249f14e2fe..28422c32cc8ff 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -19,7 +19,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx lam test_shadow_stack + corrupt_xstate_header amx lam test_shadow_stack avx # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall @@ -132,3 +132,7 @@ $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack $(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack + +$(OUTPUT)/avx_64: CFLAGS += -mno-avx -mno-avx512f +$(OUTPUT)/amx_64: EXTRA_FILES += xstate.c +$(OUTPUT)/avx_64: EXTRA_FILES += xstate.c diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index 1fdf35a4d7f63..40769c16de1bb 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -3,7 +3,6 @@ #define _GNU_SOURCE #include #include -#include #include #include #include @@ -14,169 +13,27 @@ #include #include #include -#include #include #include -#include -#include "../kselftest.h" /* For __cpuid_count() */ +#include "helpers.h" +#include "xstate.h" #ifndef __x86_64__ # error This test is 64-bit only #endif -#define XSAVE_HDR_OFFSET 512 -#define XSAVE_HDR_SIZE 64 - -struct xsave_buffer { - union { - struct { - char legacy[XSAVE_HDR_OFFSET]; - char header[XSAVE_HDR_SIZE]; - char extended[0]; - }; - char bytes[0]; - }; -}; - -static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xsave (%%rdi)" - : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) - : "memory"); -} - -static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xrstor (%%rdi)" - : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); -} - /* err() exits and will not return */ #define fatal_error(msg, ...) err(1, "[FAIL]\t" msg, ##__VA_ARGS__) -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - fatal_error("sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - fatal_error("sigaction"); -} - -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 #define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) #define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) #define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) -#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) -#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) - -static uint32_t xbuf_size; - -static struct { - uint32_t xbuf_offset; - uint32_t size; -} xtiledata; - -#define CPUID_LEAF_XSTATE 0xd -#define CPUID_SUBLEAF_XSTATE_USER 0x0 -#define TILE_CPUID 0x1d -#define TILE_PALETTE_ID 0x1 - -static void check_cpuid_xtiledata(void) -{ - uint32_t eax, ebx, ecx, edx; - - __cpuid_count(CPUID_LEAF_XSTATE, CPUID_SUBLEAF_XSTATE_USER, - eax, ebx, ecx, edx); - - /* - * EBX enumerates the size (in bytes) required by the XSAVE - * instruction for an XSAVE area containing all the user state - * components corresponding to bits currently set in XCR0. - * - * Stash that off so it can be used to allocate buffers later. - */ - xbuf_size = ebx; - - __cpuid_count(CPUID_LEAF_XSTATE, XFEATURE_XTILEDATA, - eax, ebx, ecx, edx); - /* - * eax: XTILEDATA state component size - * ebx: XTILEDATA state component offset in user buffer - */ - if (!eax || !ebx) - fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", - eax, ebx); - - xtiledata.size = eax; - xtiledata.xbuf_offset = ebx; -} +struct xstate_info xtiledata; /* The helpers for managing XSAVE buffer and tile states: */ -struct xsave_buffer *alloc_xbuf(void) -{ - struct xsave_buffer *xbuf; - - /* XSAVE buffer should be 64B-aligned. */ - xbuf = aligned_alloc(64, xbuf_size); - if (!xbuf) - fatal_error("aligned_alloc()"); - return xbuf; -} - -static inline void clear_xstate_header(struct xsave_buffer *buffer) -{ - memset(&buffer->header, 0, sizeof(buffer->header)); -} - -static inline void set_xstatebv(struct xsave_buffer *buffer, uint64_t bv) -{ - /* XSTATE_BV is at the beginning of the header: */ - *(uint64_t *)(&buffer->header) = bv; -} - -static void set_rand_tiledata(struct xsave_buffer *xbuf) -{ - int *ptr = (int *)&xbuf->bytes[xtiledata.xbuf_offset]; - int data; - int i; - - /* - * Ensure that 'data' is never 0. This ensures that - * the registers are never in their initial configuration - * and thus never tracked as being in the init state. - */ - data = rand() | 1; - - for (i = 0; i < xtiledata.size / sizeof(int); i++, ptr++) - *ptr = data; -} - struct xsave_buffer *stashed_xsave; static void init_stashed_xsave(void) @@ -192,21 +49,6 @@ static void free_stashed_xsave(void) free(stashed_xsave); } -/* See 'struct _fpx_sw_bytes' at sigcontext.h */ -#define SW_BYTES_OFFSET 464 -/* N.B. The struct's field name varies so read from the offset. */ -#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) - -static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *buffer) -{ - return (struct _fpx_sw_bytes *)(buffer + SW_BYTES_OFFSET); -} - -static inline uint64_t get_fpx_sw_bytes_features(void *buffer) -{ - return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); -} - /* Work around printf() being unsafe in signals: */ #define SIGNAL_BUF_LEN 1000 char signal_message_buffer[SIGNAL_BUF_LEN]; @@ -304,17 +146,10 @@ static inline bool load_rand_tiledata(struct xsave_buffer *xbuf) { clear_xstate_header(xbuf); set_xstatebv(xbuf, XFEATURE_MASK_XTILEDATA); - set_rand_tiledata(xbuf); + set_rand_data(&xtiledata, xbuf); return xrstor_safe(xbuf, XFEATURE_MASK_XTILEDATA); } -/* Return XTILEDATA to its initial configuration. */ -static inline void init_xtiledata(void) -{ - clear_xstate_header(stashed_xsave); - xrstor_safe(stashed_xsave, XFEATURE_MASK_XTILEDATA); -} - enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; /* arch_prctl() and sigaltstack() test */ @@ -587,14 +422,6 @@ static inline bool __validate_tiledata_regs(struct xsave_buffer *xbuf1) return true; } -static inline void validate_tiledata_regs_same(struct xsave_buffer *xbuf) -{ - int ret = __validate_tiledata_regs(xbuf); - - if (ret != 0) - fatal_error("TILEDATA registers changed"); -} - static inline void validate_tiledata_regs_changed(struct xsave_buffer *xbuf) { int ret = __validate_tiledata_regs(xbuf); @@ -651,251 +478,6 @@ static void test_fork(void) _exit(0); } -/* Context switching test */ - -static struct _ctxtswtest_cfg { - unsigned int iterations; - unsigned int num_threads; -} ctxtswtest_config; - -struct futex_info { - pthread_t thread; - int nr; - pthread_mutex_t mutex; - struct futex_info *next; -}; - -static void *check_tiledata(void *info) -{ - struct futex_info *finfo = (struct futex_info *)info; - struct xsave_buffer *xbuf; - int i; - - xbuf = alloc_xbuf(); - if (!xbuf) - fatal_error("unable to allocate XSAVE buffer"); - - /* - * Load random data into 'xbuf' and then restore - * it to the tile registers themselves. - */ - load_rand_tiledata(xbuf); - for (i = 0; i < ctxtswtest_config.iterations; i++) { - pthread_mutex_lock(&finfo->mutex); - - /* - * Ensure the register values have not - * diverged from those recorded in 'xbuf'. - */ - validate_tiledata_regs_same(xbuf); - - /* Load new, random values into xbuf and registers */ - load_rand_tiledata(xbuf); - - /* - * The last thread's last unlock will be for - * thread 0's mutex. However, thread 0 will - * have already exited the loop and the mutex - * will already be unlocked. - * - * Because this is not an ERRORCHECK mutex, - * that inconsistency will be silently ignored. - */ - pthread_mutex_unlock(&finfo->next->mutex); - } - - free(xbuf); - /* - * Return this thread's finfo, which is - * a unique value for this thread. - */ - return finfo; -} - -static int create_threads(int num, struct futex_info *finfo) -{ - int i; - - for (i = 0; i < num; i++) { - int next_nr; - - finfo[i].nr = i; - /* - * Thread 'i' will wait on this mutex to - * be unlocked. Lock it immediately after - * initialization: - */ - pthread_mutex_init(&finfo[i].mutex, NULL); - pthread_mutex_lock(&finfo[i].mutex); - - next_nr = (i + 1) % num; - finfo[i].next = &finfo[next_nr]; - - if (pthread_create(&finfo[i].thread, NULL, check_tiledata, &finfo[i])) - fatal_error("pthread_create()"); - } - return 0; -} - -static void affinitize_cpu0(void) -{ - cpu_set_t cpuset; - - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - - if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) - fatal_error("sched_setaffinity to CPU 0"); -} - -static void test_context_switch(void) -{ - struct futex_info *finfo; - int i; - - /* Affinitize to one CPU to force context switches */ - affinitize_cpu0(); - - req_xtiledata_perm(); - - printf("[RUN]\tCheck tiledata context switches, %d iterations, %d threads.\n", - ctxtswtest_config.iterations, - ctxtswtest_config.num_threads); - - - finfo = malloc(sizeof(*finfo) * ctxtswtest_config.num_threads); - if (!finfo) - fatal_error("malloc()"); - - create_threads(ctxtswtest_config.num_threads, finfo); - - /* - * This thread wakes up thread 0 - * Thread 0 will wake up 1 - * Thread 1 will wake up 2 - * ... - * the last thread will wake up 0 - * - * ... this will repeat for the configured - * number of iterations. - */ - pthread_mutex_unlock(&finfo[0].mutex); - - /* Wait for all the threads to finish: */ - for (i = 0; i < ctxtswtest_config.num_threads; i++) { - void *thread_retval; - int rc; - - rc = pthread_join(finfo[i].thread, &thread_retval); - - if (rc) - fatal_error("pthread_join() failed for thread %d err: %d\n", - i, rc); - - if (thread_retval != &finfo[i]) - fatal_error("unexpected thread retval for thread %d: %p\n", - i, thread_retval); - - } - - printf("[OK]\tNo incorrect case was found.\n"); - - free(finfo); -} - -/* Ptrace test */ - -/* - * Make sure the ptracee has the expanded kernel buffer on the first - * use. Then, initialize the state before performing the state - * injection from the ptracer. - */ -static inline void ptracee_firstuse_tiledata(void) -{ - load_rand_tiledata(stashed_xsave); - init_xtiledata(); -} - -/* - * Ptracer injects the randomized tile data state. It also reads - * before and after that, which will execute the kernel's state copy - * functions. So, the tester is advised to double-check any emitted - * kernel messages. - */ -static void ptracer_inject_tiledata(pid_t target) -{ - struct xsave_buffer *xbuf; - struct iovec iov; - - xbuf = alloc_xbuf(); - if (!xbuf) - fatal_error("unable to allocate XSAVE buffer"); - - printf("\tRead the init'ed tiledata via ptrace().\n"); - - iov.iov_base = xbuf; - iov.iov_len = xbuf_size; - - memset(stashed_xsave, 0, xbuf_size); - - if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_GETREGSET"); - - if (!__compare_tiledata_state(stashed_xsave, xbuf)) - printf("[OK]\tThe init'ed tiledata was read from ptracee.\n"); - else - printf("[FAIL]\tThe init'ed tiledata was not read from ptracee.\n"); - - printf("\tInject tiledata via ptrace().\n"); - - load_rand_tiledata(xbuf); - - memcpy(&stashed_xsave->bytes[xtiledata.xbuf_offset], - &xbuf->bytes[xtiledata.xbuf_offset], - xtiledata.size); - - if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_SETREGSET"); - - if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_GETREGSET"); - - if (!__compare_tiledata_state(stashed_xsave, xbuf)) - printf("[OK]\tTiledata was correctly written to ptracee.\n"); - else - printf("[FAIL]\tTiledata was not correctly written to ptracee.\n"); -} - -static void test_ptrace(void) -{ - pid_t child; - int status; - - child = fork(); - if (child < 0) { - err(1, "fork"); - } else if (!child) { - if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) - err(1, "PTRACE_TRACEME"); - - ptracee_firstuse_tiledata(); - - raise(SIGTRAP); - _exit(0); - } - - do { - wait(&status); - } while (WSTOPSIG(status) != SIGTRAP); - - ptracer_inject_tiledata(child); - - ptrace(PTRACE_DETACH, child, NULL, NULL); - wait(&status); - if (!WIFEXITED(status) || WEXITSTATUS(status)) - err(1, "ptrace test"); -} - int main(void) { unsigned long features; @@ -907,7 +489,11 @@ int main(void) return KSFT_SKIP; } - check_cpuid_xtiledata(); + xtiledata = get_xstate_info(XFEATURE_XTILEDATA); + if (!xtiledata.size || !xtiledata.xbuf_offset) { + fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", + xtiledata.size, xtiledata.xbuf_offset); + } init_stashed_xsave(); sethandler(SIGILL, handle_noperm, 0); @@ -919,11 +505,11 @@ int main(void) test_fork(); - ctxtswtest_config.iterations = 10; - ctxtswtest_config.num_threads = 5; - test_context_switch(); - - test_ptrace(); + /* + * Perform generic xstate tests for context switching, ptrace, + * and signal. + */ + test_xstate(XFEATURE_XTILEDATA); clearhandler(SIGILL); free_stashed_xsave(); diff --git a/tools/testing/selftests/x86/avx.c b/tools/testing/selftests/x86/avx.c new file mode 100644 index 0000000000000..11d5367c235fc --- /dev/null +++ b/tools/testing/selftests/x86/avx.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE /* Required for inline xstate helpers */ +#include "xstate.h" + +int main(void) +{ + test_xstate(XFEATURE_YMM); + test_xstate(XFEATURE_OPMASK); + test_xstate(XFEATURE_ZMM_Hi256); + test_xstate(XFEATURE_Hi16_ZMM); +} diff --git a/tools/testing/selftests/x86/corrupt_xstate_header.c b/tools/testing/selftests/x86/corrupt_xstate_header.c index cf9ce8fbb656c..93a89a5997ca8 100644 --- a/tools/testing/selftests/x86/corrupt_xstate_header.c +++ b/tools/testing/selftests/x86/corrupt_xstate_header.c @@ -18,6 +18,7 @@ #include #include "../kselftest.h" /* For __cpuid_count() */ +#include "helpers.h" static inline int xsave_enabled(void) { @@ -29,19 +30,6 @@ static inline int xsave_enabled(void) return ecx & (1U << 27); } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigusr1(int sig, siginfo_t *info, void *uc_void) { ucontext_t *uc = uc_void; diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index d1e919b0c1dc8..5cb8393737d05 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -24,31 +24,11 @@ #include #include +#include "helpers.h" + static unsigned long load_addr = 0x10000; static int nerrs = 0; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static sig_atomic_t got_signal; static void sighandler(int sig, siginfo_t *info, void *ctx_void) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 50cf32de63139..0a75252d31b6a 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -28,6 +28,8 @@ #include #include +#include "helpers.h" + #ifndef __x86_64__ # error This test is 64-bit only #endif @@ -39,28 +41,6 @@ static unsigned short *shared_scratch; static int nerrs; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigsegv(int sig, siginfo_t *si, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; diff --git a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h index 4ef42c4559a9e..6deaad035161a 100644 --- a/tools/testing/selftests/x86/helpers.h +++ b/tools/testing/selftests/x86/helpers.h @@ -2,8 +2,13 @@ #ifndef __SELFTESTS_X86_HELPERS_H #define __SELFTESTS_X86_HELPERS_H +#include +#include + #include +#include "../kselftest.h" + static inline unsigned long get_eflags(void) { #ifdef __x86_64__ @@ -22,4 +27,27 @@ static inline void set_eflags(unsigned long eflags) #endif } +static inline void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed"); +} + +static inline void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed"); +} + #endif /* __SELFTESTS_X86_HELPERS_H */ diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c index 57ec5e99edb93..69d5fb7050c25 100644 --- a/tools/testing/selftests/x86/ioperm.c +++ b/tools/testing/selftests/x86/ioperm.c @@ -20,30 +20,9 @@ #include #include -static int nerrs = 0; - -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - -} +#include "helpers.h" -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} +static int nerrs = 0; static jmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/iopl.c b/tools/testing/selftests/x86/iopl.c index 7e3e09c1abac6..457b6715542bd 100644 --- a/tools/testing/selftests/x86/iopl.c +++ b/tools/testing/selftests/x86/iopl.c @@ -20,30 +20,9 @@ #include #include -static int nerrs = 0; - -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - -} +#include "helpers.h" -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} +static int nerrs = 0; static jmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 4d4a76532dc9a..18d736640ece4 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,15 @@ #define FUNC_INHERITE 0x20 #define FUNC_PASID 0x40 +/* get_user() pointer test cases */ +#define GET_USER_USER 0 +#define GET_USER_KERNEL_TOP 1 +#define GET_USER_KERNEL_BOT 2 +#define GET_USER_KERNEL 3 + #define TEST_MASK 0x7f +#define L5_SIGN_EXT_MASK (0xFFUL << 56) +#define L4_SIGN_EXT_MASK (0x1FFFFUL << 47) #define LOW_ADDR (0x1UL << 30) #define HIGH_ADDR (0x3UL << 48) @@ -115,23 +124,42 @@ static void segv_handler(int sig) siglongjmp(segv_env, 1); } -static inline int cpu_has_lam(void) +static inline int lam_is_available(void) { unsigned int cpuinfo[4]; + unsigned long bits = 0; + int ret; __cpuid_count(0x7, 1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); - return (cpuinfo[0] & (1 << 26)); + /* Check if cpu supports LAM */ + if (!(cpuinfo[0] & (1 << 26))) { + ksft_print_msg("LAM is not supported!\n"); + return 0; + } + + /* Return 0 if CONFIG_ADDRESS_MASKING is not set */ + ret = syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits); + if (ret) { + ksft_print_msg("LAM is disabled in the kernel!\n"); + return 0; + } + + return 1; } -/* Check 5-level page table feature in CPUID.(EAX=07H, ECX=00H):ECX.[bit 16] */ -static inline int cpu_has_la57(void) +static inline int la57_enabled(void) { - unsigned int cpuinfo[4]; + int ret; + void *p; + + p = mmap((void *)HIGH_ADDR, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); - __cpuid_count(0x7, 0, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + ret = p == MAP_FAILED ? 0 : 1; - return (cpuinfo[2] & (1 << 16)); + munmap(p, PAGE_SIZE); + return ret; } /* @@ -322,7 +350,7 @@ static int handle_mmap(struct testcases *test) flags, -1, 0); if (ptr == MAP_FAILED) { if (test->addr == HIGH_ADDR) - if (!cpu_has_la57()) + if (!la57_enabled()) return 3; /* unsupport LA57 */ return 1; } @@ -370,6 +398,78 @@ static int handle_syscall(struct testcases *test) return ret; } +static int get_user_syscall(struct testcases *test) +{ + uint64_t ptr_address, bitmask; + int fd, ret = 0; + void *ptr; + + if (la57_enabled()) { + bitmask = L5_SIGN_EXT_MASK; + ptr_address = HIGH_ADDR; + } else { + bitmask = L4_SIGN_EXT_MASK; + ptr_address = LOW_ADDR; + } + + ptr = mmap((void *)ptr_address, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + + if (ptr == MAP_FAILED) { + perror("failed to map byte to pass into get_user"); + return 1; + } + + if (set_lam(test->lam) != 0) { + ret = 2; + goto error; + } + + fd = memfd_create("lam_ioctl", 0); + if (fd == -1) { + munmap(ptr, PAGE_SIZE); + exit(EXIT_FAILURE); + } + + switch (test->later) { + case GET_USER_USER: + /* Control group - properly tagged user pointer */ + ptr = (void *)set_metadata((uint64_t)ptr, test->lam); + break; + case GET_USER_KERNEL_TOP: + /* Kernel address with top bit cleared */ + bitmask &= (bitmask >> 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL_BOT: + /* Kernel address with bottom sign-extension bit cleared */ + bitmask &= (bitmask << 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL: + /* Try to pass a kernel address */ + ptr = (void *)((uint64_t)ptr | bitmask); + break; + default: + printf("Invalid test case value passed!\n"); + break; + } + + /* + * Use FIOASYNC ioctl because it utilizes get_user() internally and is + * very non-invasive to the system. Pass differently tagged pointers to + * get_user() in order to verify that valid user pointers are going + * through and invalid kernel/non-canonical pointers are not. + */ + if (ioctl(fd, FIOASYNC, ptr) != 0) + ret = 1; + + close(fd); +error: + munmap(ptr, PAGE_SIZE); + return ret; +} + int sys_uring_setup(unsigned int entries, struct io_uring_params *p) { return (int)syscall(__NR_io_uring_setup, entries, p); @@ -596,8 +696,10 @@ int do_uring(unsigned long lam) fi->file_fd = file_fd; ring = malloc(sizeof(*ring)); - if (!ring) + if (!ring) { + free(fi); return 1; + } memset(ring, 0, sizeof(struct io_ring)); @@ -883,6 +985,33 @@ static struct testcases syscall_cases[] = { .test_func = handle_syscall, .msg = "SYSCALL:[Negative] Disable LAM. Dereferencing pointer with metadata.\n", }, + { + .later = GET_USER_USER, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER: get_user() and pass a properly tagged user pointer.\n", + }, + { + .later = GET_USER_KERNEL_TOP, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the top bit cleared.\n", + }, + { + .later = GET_USER_KERNEL_BOT, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the bottom sign-extension bit cleared.\n", + }, + { + .later = GET_USER_KERNEL, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() and pass a kernel pointer.\n", + }, }; static struct testcases mmap_cases[] = { @@ -1181,10 +1310,8 @@ int main(int argc, char **argv) tests_cnt = 0; - if (!cpu_has_lam()) { - ksft_print_msg("Unsupported LAM feature!\n"); + if (!lam_is_available()) return KSFT_SKIP; - } while ((c = getopt(argc, argv, "ht:")) != -1) { switch (c) { diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 3a29346e1452d..bb99a71380a5f 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -26,6 +26,8 @@ #include #include +#include "helpers.h" + #define AR_ACCESSED (1<<8) #define AR_TYPE_RODATA (0 * (1<<9)) @@ -506,20 +508,6 @@ static void fix_sa_restorer(int sig) } #endif -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - - fix_sa_restorer(sig); -} - static jmp_buf jmpbuf; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) @@ -549,9 +537,11 @@ static void do_multicpu_tests(void) } sethandler(SIGSEGV, sigsegv, 0); + fix_sa_restorer(SIGSEGV); #ifdef __i386__ /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */ sethandler(SIGILL, sigsegv, 0); + fix_sa_restorer(SIGILL); #endif printf("[RUN]\tCross-CPU LDT invalidation\n"); diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c index cc3de6ff9fba1..f22cb6b382f9c 100644 --- a/tools/testing/selftests/x86/mov_ss_trap.c +++ b/tools/testing/selftests/x86/mov_ss_trap.c @@ -36,7 +36,7 @@ #include #include -#define X86_EFLAGS_RF (1UL << 16) +#include "helpers.h" #if __x86_64__ # define REG_IP REG_RIP @@ -94,18 +94,6 @@ static void enable_watchpoint(void) } } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static char const * const signames[] = { [SIGSEGV] = "SIGSEGV", [SIGBUS] = "SIBGUS", diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 12aaa063196e7..360ec88d5432c 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -15,6 +15,8 @@ #include #include +#include "helpers.h" + /* Bitness-agnostic defines for user_regs_struct fields. */ #ifdef __x86_64__ # define user_syscall_nr orig_rax @@ -93,18 +95,6 @@ static siginfo_t wait_trap(pid_t chld) return si; } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void setsigign(int sig, int flags) { struct sigaction sa; @@ -116,16 +106,6 @@ static void setsigign(int sig, int flags) err(1, "sigaction"); } -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - #ifdef __x86_64__ # define REG_BP REG_RBP #else diff --git a/tools/testing/selftests/x86/sigaltstack.c b/tools/testing/selftests/x86/sigaltstack.c index f689af75e979e..0ae1b784498cc 100644 --- a/tools/testing/selftests/x86/sigaltstack.c +++ b/tools/testing/selftests/x86/sigaltstack.c @@ -14,6 +14,8 @@ #include #include +#include "helpers.h" + /* sigaltstack()-enforced minimum stack */ #define ENFORCED_MINSIGSTKSZ 2048 @@ -27,30 +29,6 @@ static bool sigalrm_expected; static unsigned long at_minstack_size; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static int setup_altstack(void *start, unsigned long size) { stack_t ss; diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 0b75b29f794b7..26ef562f4232a 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -46,6 +46,8 @@ #include #include +#include "helpers.h" + /* Pull in AR_xyz defines. */ typedef unsigned int u32; typedef unsigned short u16; @@ -138,28 +140,6 @@ static unsigned short LDT3(int idx) return (idx << 3) | 7; } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void add_ldt(const struct user_desc *desc, unsigned short *var, const char *name) { diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c index 9a30f443e9286..280d7a22b9c9b 100644 --- a/tools/testing/selftests/x86/single_step_syscall.c +++ b/tools/testing/selftests/x86/single_step_syscall.c @@ -33,28 +33,6 @@ #include "helpers.h" -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static volatile sig_atomic_t sig_traps, sig_eflags; sigjmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index 48ab065a76f9b..f67a2df335ba0 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -17,18 +17,6 @@ #include "helpers.h" -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static sigjmp_buf jmpbuf; static volatile sig_atomic_t n_errs; diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index a108b80dd0823..f9c9814160f09 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -18,18 +18,6 @@ static unsigned int nerrs; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigtrap(int sig, siginfo_t *si, void *ctx_void) { } diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index 991591718bb0a..41c42b7b54a6c 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -25,6 +25,7 @@ #include #include +#include "../kselftest.h" /* Common system call numbers */ #define SYS_READ 0 @@ -313,7 +314,7 @@ static void test_syscall_numbering(void) * The MSB is supposed to be ignored, so we loop over a few * to test that out. */ - for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { + for (size_t i = 0; i < ARRAY_SIZE(msbs); i++) { int msb = msbs[i]; run("Checking system calls with msb = %d (0x%x)\n", msb, msb); diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index b30de9aaa6d41..5fb531e3ad7c3 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -22,6 +22,8 @@ #include #include +#include "helpers.h" + /* * These items are in clang_helpers_64.S, in order to avoid clang inline asm * limitations: @@ -31,28 +33,6 @@ extern const char test_page[]; static void const *current_test_page_addr = test_page; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - /* State used by our signal handlers. */ static gregset_t initial_regs; diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 6de11b4df4583..05e1e6774fba3 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -310,19 +310,6 @@ static void test_getcpu(int cpu) static jmp_buf jmpbuf; static volatile unsigned long segv_err; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - ksft_exit_fail_msg("sigaction failed\n"); -} - static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c index 4c311e1af4c7a..9cc17588d8189 100644 --- a/tools/testing/selftests/x86/unwind_vdso.c +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -43,18 +43,6 @@ int main() #include #include -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static volatile sig_atomic_t nerrs; static unsigned long sysinfo; static bool got_sysinfo = false; diff --git a/tools/testing/selftests/x86/xstate.c b/tools/testing/selftests/x86/xstate.c new file mode 100644 index 0000000000000..23c1d6c964eae --- /dev/null +++ b/tools/testing/selftests/x86/xstate.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "helpers.h" +#include "xstate.h" + +/* + * The userspace xstate test suite is designed to be generic and operates + * with randomized xstate data. However, some states require special handling: + * + * - PKRU and XTILECFG need specific adjustments, such as modifying + * randomization behavior or using fixed values. + * - But, PKRU already has a dedicated test suite in /tools/selftests/mm. + * - Legacy states (FP and SSE) are excluded, as they are not considered + * part of extended states (xstates) and their usage is already deeply + * integrated into user-space libraries. + */ +#define XFEATURE_MASK_TEST_SUPPORTED \ + ((1 << XFEATURE_YMM) | \ + (1 << XFEATURE_OPMASK) | \ + (1 << XFEATURE_ZMM_Hi256) | \ + (1 << XFEATURE_Hi16_ZMM) | \ + (1 << XFEATURE_XTILEDATA)) + +static inline uint64_t xgetbv(uint32_t index) +{ + uint32_t eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((uint64_t)edx << 32); +} + +static inline uint64_t get_xstatebv(struct xsave_buffer *xbuf) +{ + return *(uint64_t *)(&xbuf->header); +} + +static struct xstate_info xstate; + +struct futex_info { + unsigned int iterations; + struct futex_info *next; + pthread_mutex_t mutex; + pthread_t thread; + bool valid; + int nr; +}; + +static inline void load_rand_xstate(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + set_xstatebv(xbuf, xstate->mask); + set_rand_data(xstate, xbuf); + xrstor(xbuf, xstate->mask); +} + +static inline void load_init_xstate(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + xrstor(xbuf, xstate->mask); +} + +static inline void copy_xstate(struct xsave_buffer *xbuf_dst, struct xsave_buffer *xbuf_src) +{ + memcpy(&xbuf_dst->bytes[xstate.xbuf_offset], + &xbuf_src->bytes[xstate.xbuf_offset], + xstate.size); +} + +static inline bool validate_xstate_same(struct xsave_buffer *xbuf1, struct xsave_buffer *xbuf2) +{ + int ret; + + ret = memcmp(&xbuf1->bytes[xstate.xbuf_offset], + &xbuf2->bytes[xstate.xbuf_offset], + xstate.size); + return ret == 0; +} + +static inline bool validate_xregs_same(struct xsave_buffer *xbuf1) +{ + struct xsave_buffer *xbuf2; + bool ret; + + xbuf2 = alloc_xbuf(); + if (!xbuf2) + ksft_exit_fail_msg("failed to allocate XSAVE buffer\n"); + + xsave(xbuf2, xstate.mask); + ret = validate_xstate_same(xbuf1, xbuf2); + + free(xbuf2); + return ret; +} + +/* Context switching test */ + +static void *check_xstate(void *info) +{ + struct futex_info *finfo = (struct futex_info *)info; + struct xsave_buffer *xbuf; + int i; + + xbuf = alloc_xbuf(); + if (!xbuf) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + /* + * Load random data into 'xbuf' and then restore it to the xstate + * registers. + */ + load_rand_xstate(&xstate, xbuf); + finfo->valid = true; + + for (i = 0; i < finfo->iterations; i++) { + pthread_mutex_lock(&finfo->mutex); + + /* + * Ensure the register values have not diverged from the + * record. Then reload a new random value. If it failed + * ever before, skip it. + */ + if (finfo->valid) { + finfo->valid = validate_xregs_same(xbuf); + load_rand_xstate(&xstate, xbuf); + } + + /* + * The last thread's last unlock will be for thread 0's + * mutex. However, thread 0 will have already exited the + * loop and the mutex will already be unlocked. + * + * Because this is not an ERRORCHECK mutex, that + * inconsistency will be silently ignored. + */ + pthread_mutex_unlock(&finfo->next->mutex); + } + + free(xbuf); + return finfo; +} + +static void create_threads(uint32_t num_threads, uint32_t iterations, struct futex_info *finfo) +{ + int i; + + for (i = 0; i < num_threads; i++) { + int next_nr; + + finfo[i].nr = i; + finfo[i].iterations = iterations; + + /* + * Thread 'i' will wait on this mutex to be unlocked. + * Lock it immediately after initialization: + */ + pthread_mutex_init(&finfo[i].mutex, NULL); + pthread_mutex_lock(&finfo[i].mutex); + + next_nr = (i + 1) % num_threads; + finfo[i].next = &finfo[next_nr]; + + if (pthread_create(&finfo[i].thread, NULL, check_xstate, &finfo[i])) + ksft_exit_fail_msg("pthread_create() failed\n"); + } +} + +static bool checkout_threads(uint32_t num_threads, struct futex_info *finfo) +{ + void *thread_retval; + bool valid = true; + int err, i; + + for (i = 0; i < num_threads; i++) { + err = pthread_join(finfo[i].thread, &thread_retval); + if (err) + ksft_exit_fail_msg("pthread_join() failed for thread %d err: %d\n", i, err); + + if (thread_retval != &finfo[i]) { + ksft_exit_fail_msg("unexpected thread retval for thread %d: %p\n", + i, thread_retval); + } + + valid &= finfo[i].valid; + } + + return valid; +} + +static void affinitize_cpu0(void) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + ksft_exit_fail_msg("sched_setaffinity to CPU 0 failed\n"); +} + +static void test_context_switch(uint32_t num_threads, uint32_t iterations) +{ + struct futex_info *finfo; + + /* Affinitize to one CPU to force context switches */ + affinitize_cpu0(); + + printf("[RUN]\t%s: check context switches, %d iterations, %d threads.\n", + xstate.name, iterations, num_threads); + + finfo = malloc(sizeof(*finfo) * num_threads); + if (!finfo) + ksft_exit_fail_msg("unable allocate memory\n"); + + create_threads(num_threads, iterations, finfo); + + /* + * This thread wakes up thread 0 + * Thread 0 will wake up 1 + * Thread 1 will wake up 2 + * ... + * The last thread will wake up 0 + * + * This will repeat for the configured + * number of iterations. + */ + pthread_mutex_unlock(&finfo[0].mutex); + + /* Wait for all the threads to finish: */ + if (checkout_threads(num_threads, finfo)) + printf("[OK]\tNo incorrect case was found.\n"); + else + printf("[FAIL]\tFailed with context switching test.\n"); + + free(finfo); +} + +/* + * Ptrace test for the ABI format as described in arch/x86/include/asm/user.h + */ + +/* + * Make sure the ptracee has the expanded kernel buffer on the first use. + * Then, initialize the state before performing the state injection from + * the ptracer. For non-dynamic states, this is benign. + */ +static inline void ptracee_touch_xstate(void) +{ + struct xsave_buffer *xbuf; + + xbuf = alloc_xbuf(); + + load_rand_xstate(&xstate, xbuf); + load_init_xstate(&xstate, xbuf); + + free(xbuf); +} + +/* + * Ptracer injects the randomized xstate data. It also reads before and + * after that, which will execute the kernel's state copy functions. + */ +static void ptracer_inject_xstate(pid_t target) +{ + uint32_t xbuf_size = get_xbuf_size(); + struct xsave_buffer *xbuf1, *xbuf2; + struct iovec iov; + + /* + * Allocate buffers to keep data while ptracer can write the + * other buffer + */ + xbuf1 = alloc_xbuf(); + xbuf2 = alloc_xbuf(); + if (!xbuf1 || !xbuf2) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + iov.iov_base = xbuf1; + iov.iov_len = xbuf_size; + + if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_GETREGSET failed\n"); + + printf("[RUN]\t%s: inject xstate via ptrace().\n", xstate.name); + + load_rand_xstate(&xstate, xbuf1); + copy_xstate(xbuf2, xbuf1); + + if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_SETREGSET failed\n"); + + if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_GETREGSET failed\n"); + + if (*(uint64_t *)get_fpx_sw_bytes(xbuf1) == xgetbv(0)) + printf("[OK]\t'xfeatures' in SW reserved area was correctly written\n"); + else + printf("[FAIL]\t'xfeatures' in SW reserved area was not correctly written\n"); + + if (validate_xstate_same(xbuf2, xbuf1)) + printf("[OK]\txstate was correctly updated.\n"); + else + printf("[FAIL]\txstate was not correctly updated.\n"); + + free(xbuf1); + free(xbuf2); +} + +static void test_ptrace(void) +{ + pid_t child; + int status; + + child = fork(); + if (child < 0) { + ksft_exit_fail_msg("fork() failed\n"); + } else if (!child) { + if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) + ksft_exit_fail_msg("PTRACE_TRACEME failed\n"); + + ptracee_touch_xstate(); + + raise(SIGTRAP); + _exit(0); + } + + do { + wait(&status); + } while (WSTOPSIG(status) != SIGTRAP); + + ptracer_inject_xstate(child); + + ptrace(PTRACE_DETACH, child, NULL, NULL); + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + ksft_exit_fail_msg("ptracee exit error\n"); +} + +/* + * Test signal delivery for the ABI compatibility. + * See the ABI format: arch/x86/include/uapi/asm/sigcontext.h + */ + +/* + * Avoid using printf() in signal handlers as it is not + * async-signal-safe. + */ +#define SIGNAL_BUF_LEN 1000 +static char signal_message_buffer[SIGNAL_BUF_LEN]; +static void sig_print(char *msg) +{ + int left = SIGNAL_BUF_LEN - strlen(signal_message_buffer) - 1; + + strncat(signal_message_buffer, msg, left); +} + +static struct xsave_buffer *stashed_xbuf; + +static void validate_sigfpstate(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + void *xbuf = ctx->uc_mcontext.fpregs; + struct _fpx_sw_bytes *sw_bytes; + uint32_t magic2; + + /* Reset the signal message buffer: */ + signal_message_buffer[0] = '\0'; + + sw_bytes = get_fpx_sw_bytes(xbuf); + if (sw_bytes->magic1 == FP_XSTATE_MAGIC1) + sig_print("[OK]\t'magic1' is valid\n"); + else + sig_print("[FAIL]\t'magic1' is not valid\n"); + + if (get_fpx_sw_bytes_features(xbuf) & xstate.mask) + sig_print("[OK]\t'xfeatures' in SW reserved area is valid\n"); + else + sig_print("[FAIL]\t'xfeatures' in SW reserved area is not valid\n"); + + if (get_xstatebv(xbuf) & xstate.mask) + sig_print("[OK]\t'xfeatures' in XSAVE header is valid\n"); + else + sig_print("[FAIL]\t'xfeatures' in XSAVE header is not valid\n"); + + if (validate_xstate_same(stashed_xbuf, xbuf)) + sig_print("[OK]\txstate delivery was successful\n"); + else + sig_print("[FAIL]\txstate delivery was not successful\n"); + + magic2 = *(uint32_t *)(xbuf + sw_bytes->xstate_size); + if (magic2 == FP_XSTATE_MAGIC2) + sig_print("[OK]\t'magic2' is valid\n"); + else + sig_print("[FAIL]\t'magic2' is not valid\n"); + + set_rand_data(&xstate, xbuf); + copy_xstate(stashed_xbuf, xbuf); +} + +static void test_signal(void) +{ + bool valid_xstate; + + /* + * The signal handler will access this to verify xstate context + * preservation. + */ + stashed_xbuf = alloc_xbuf(); + if (!stashed_xbuf) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + printf("[RUN]\t%s: load xstate and raise SIGUSR1\n", xstate.name); + + sethandler(SIGUSR1, validate_sigfpstate, 0); + + load_rand_xstate(&xstate, stashed_xbuf); + + raise(SIGUSR1); + + /* + * Immediately record the test result, deferring printf() to + * prevent unintended state contamination by that. + */ + valid_xstate = validate_xregs_same(stashed_xbuf); + printf("%s", signal_message_buffer); + + printf("[RUN]\t%s: load new xstate from sighandler and check it after sigreturn\n", + xstate.name); + + if (valid_xstate) + printf("[OK]\txstate was restored correctly\n"); + else + printf("[FAIL]\txstate restoration failed\n"); + + clearhandler(SIGUSR1); + free(stashed_xbuf); +} + +void test_xstate(uint32_t feature_num) +{ + const unsigned int ctxtsw_num_threads = 5, ctxtsw_iterations = 10; + unsigned long features; + long rc; + + if (!(XFEATURE_MASK_TEST_SUPPORTED & (1 << feature_num))) { + ksft_print_msg("The xstate test does not fully support the component %u, yet.\n", + feature_num); + return; + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features); + if (rc || !(features & (1 << feature_num))) { + ksft_print_msg("The kernel does not support feature number: %u\n", feature_num); + return; + } + + xstate = get_xstate_info(feature_num); + if (!xstate.size || !xstate.xbuf_offset) { + ksft_exit_fail_msg("invalid state size/offset (%d/%d)\n", + xstate.size, xstate.xbuf_offset); + } + + test_context_switch(ctxtsw_num_threads, ctxtsw_iterations); + test_ptrace(); + test_signal(); +} diff --git a/tools/testing/selftests/x86/xstate.h b/tools/testing/selftests/x86/xstate.h new file mode 100644 index 0000000000000..42af36ec852f4 --- /dev/null +++ b/tools/testing/selftests/x86/xstate.h @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __SELFTESTS_X86_XSTATE_H +#define __SELFTESTS_X86_XSTATE_H + +#include + +#include "../kselftest.h" + +#define XSAVE_HDR_OFFSET 512 +#define XSAVE_HDR_SIZE 64 + +/* + * List of XSAVE features Linux knows about. Copied from + * arch/x86/include/asm/fpu/types.h + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + XFEATURE_PT_UNIMPLEMENTED_SO_FAR, + XFEATURE_PKRU, + XFEATURE_PASID, + XFEATURE_CET_USER, + XFEATURE_CET_KERNEL_UNUSED, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILECFG, + XFEATURE_XTILEDATA, + + XFEATURE_MAX, +}; + +/* Copied from arch/x86/kernel/fpu/xstate.c */ +static const char *xfeature_names[] = +{ + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", + "Protection Keys User registers", + "PASID state", + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", +}; + +struct xsave_buffer { + union { + struct { + char legacy[XSAVE_HDR_OFFSET]; + char header[XSAVE_HDR_SIZE]; + char extended[0]; + }; + char bytes[0]; + }; +}; + +static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_hi = rfbm >> 32; + uint32_t rfbm_lo = rfbm; + + asm volatile("xsave (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_hi = rfbm >> 32; + uint32_t rfbm_lo = rfbm; + + asm volatile("xrstor (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); +} + +#define CPUID_LEAF_XSTATE 0xd +#define CPUID_SUBLEAF_XSTATE_USER 0x0 + +static inline uint32_t get_xbuf_size(void) +{ + uint32_t eax, ebx, ecx, edx; + + __cpuid_count(CPUID_LEAF_XSTATE, CPUID_SUBLEAF_XSTATE_USER, + eax, ebx, ecx, edx); + + /* + * EBX enumerates the size (in bytes) required by the XSAVE + * instruction for an XSAVE area containing all the user state + * components corresponding to bits currently set in XCR0. + */ + return ebx; +} + +struct xstate_info { + const char *name; + uint32_t num; + uint32_t mask; + uint32_t xbuf_offset; + uint32_t size; +}; + +static inline struct xstate_info get_xstate_info(uint32_t xfeature_num) +{ + struct xstate_info xstate = { }; + uint32_t eax, ebx, ecx, edx; + + if (xfeature_num >= XFEATURE_MAX) { + ksft_print_msg("unknown state\n"); + return xstate; + } + + xstate.name = xfeature_names[xfeature_num]; + xstate.num = xfeature_num; + xstate.mask = 1 << xfeature_num; + + __cpuid_count(CPUID_LEAF_XSTATE, xfeature_num, + eax, ebx, ecx, edx); + xstate.size = eax; + xstate.xbuf_offset = ebx; + return xstate; +} + +static inline struct xsave_buffer *alloc_xbuf(void) +{ + uint32_t xbuf_size = get_xbuf_size(); + + /* XSAVE buffer should be 64B-aligned. */ + return aligned_alloc(64, xbuf_size); +} + +static inline void clear_xstate_header(struct xsave_buffer *xbuf) +{ + memset(&xbuf->header, 0, sizeof(xbuf->header)); +} + +static inline void set_xstatebv(struct xsave_buffer *xbuf, uint64_t bv) +{ + /* XSTATE_BV is at the beginning of the header: */ + *(uint64_t *)(&xbuf->header) = bv; +} + +/* See 'struct _fpx_sw_bytes' at sigcontext.h */ +#define SW_BYTES_OFFSET 464 +/* N.B. The struct's field name varies so read from the offset. */ +#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) + +static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *xbuf) +{ + return xbuf + SW_BYTES_OFFSET; +} + +static inline uint64_t get_fpx_sw_bytes_features(void *buffer) +{ + return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); +} + +static inline void set_rand_data(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + int *ptr = (int *)&xbuf->bytes[xstate->xbuf_offset]; + int data, i; + + /* + * Ensure that 'data' is never 0. This ensures that + * the registers are never in their initial configuration + * and thus never tracked as being in the init state. + */ + data = rand() | 1; + + for (i = 0; i < xstate->size / sizeof(int); i++, ptr++) + *ptr = data; +} + +/* Testing kernel's context switching and ABI support for the xstate. */ +void test_xstate(uint32_t feature_num); + +#endif /* __SELFTESTS_X86_XSTATE_H */ diff --git a/tools/thermal/lib/Makefile b/tools/thermal/lib/Makefile index f2552f73a64c7..056d212f25cf5 100644 --- a/tools/thermal/lib/Makefile +++ b/tools/thermal/lib/Makefile @@ -39,19 +39,6 @@ libdir = $(prefix)/$(libdir_relative) libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - # Set compile option CFLAGS ifdef EXTRA_CFLAGS CFLAGS := $(EXTRA_CFLAGS) diff --git a/tools/tracing/latency/Makefile b/tools/tracing/latency/Makefile index 6518b03e05c71..257a56b1899f2 100644 --- a/tools/tracing/latency/Makefile +++ b/tools/tracing/latency/Makefile @@ -37,12 +37,6 @@ FEATURE_TESTS += libtracefs FEATURE_DISPLAY := libtraceevent FEATURE_DISPLAY += libtracefs -ifeq ($(V),1) - Q = -else - Q = @ -endif - all: $(LATENCY-COLLECTOR) include $(srctree)/tools/build/Makefile.include diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 8b5101457c70b..0b61208db604e 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -37,12 +37,6 @@ FEATURE_DISPLAY := libtraceevent FEATURE_DISPLAY += libtracefs FEATURE_DISPLAY += libcpupower -ifeq ($(V),1) - Q = -else - Q = @ -endif - all: $(RTLA) include $(srctree)/tools/build/Makefile.include diff --git a/tools/verification/rv/Makefile b/tools/verification/rv/Makefile index 411d62b3d8eb9..5b898360ba481 100644 --- a/tools/verification/rv/Makefile +++ b/tools/verification/rv/Makefile @@ -35,12 +35,6 @@ FEATURE_TESTS += libtracefs FEATURE_DISPLAY := libtraceevent FEATURE_DISPLAY += libtracefs -ifeq ($(V),1) - Q = -else - Q = @ -endif - all: $(RV) include $(srctree)/tools/build/Makefile.include diff --git a/usr/include/Makefile b/usr/include/Makefile index 6c6de1b1622b1..e3d6b03527fec 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -10,7 +10,7 @@ UAPI_CFLAGS := -std=c90 -Wall -Werror=implicit-function-declaration # In theory, we do not care -m32 or -m64 for header compile tests. # It is here just because CONFIG_CC_CAN_LINK is tested with -m32 or -m64. -UAPI_CFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) +UAPI_CFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) # USERCFLAGS might contain sysroot location for CC. UAPI_CFLAGS += $(USERCFLAGS)